[llvm] 6dbc01e - [AMDGPU][True16][CodeGen] update GFX11Plus codegen test with true16 flag (#135078)

via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 23 10:07:19 PDT 2025


Author: Brox Chen
Date: 2025-04-23T13:06:52-04:00
New Revision: 6dbc01e8015816e904687c03f0ea8afac817781d

URL: https://github.com/llvm/llvm-project/commit/6dbc01e8015816e904687c03f0ea8afac817781d
DIFF: https://github.com/llvm/llvm-project/commit/6dbc01e8015816e904687c03f0ea8afac817781d.diff

LOG: [AMDGPU][True16][CodeGen] update GFX11Plus codegen test with true16 flag (#135078)

This is an NFC patch.

This patch runs a bulk update on the CodeGen tests that are impacted by the
true16 feature. The patch:
1. duplicates the GFX11Plus run lines, running one copy with
"-mattr=+real-true16" and the other with "-mattr=-real-true16"
2. regenerates the test checks with the update script

For some GISEL run lines, the current CodeGen does not fully support the
true16 version. Those run lines are still updated, but the failing ones
are commented out and a "FIXME-TRUE16" comment is added to the test for
easier tracking. These tests will be fixed in follow-up patches.

This is a transition state in which we support both "+real-true16" and
"-real-true16" in our code base. We plan to make "+real-true16" the
default, and finally to remove the "-real-true16" mode and its test
lines.

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
    llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
    llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
    llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
    llvm/test/CodeGen/AMDGPU/bitreverse.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/calling-conventions.ll
    llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
    llvm/test/CodeGen/AMDGPU/clamp.ll
    llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
    llvm/test/CodeGen/AMDGPU/ctlz.ll
    llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
    llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
    llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
    llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
    llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
    llvm/test/CodeGen/AMDGPU/fmax3.ll
    llvm/test/CodeGen/AMDGPU/fmaximum.ll
    llvm/test/CodeGen/AMDGPU/fmaximum3.ll
    llvm/test/CodeGen/AMDGPU/fmin3.ll
    llvm/test/CodeGen/AMDGPU/fminimum.ll
    llvm/test/CodeGen/AMDGPU/fminimum3.ll
    llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
    llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
    llvm/test/CodeGen/AMDGPU/fnearbyint.ll
    llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
    llvm/test/CodeGen/AMDGPU/fneg.ll
    llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
    llvm/test/CodeGen/AMDGPU/fpext-free.ll
    llvm/test/CodeGen/AMDGPU/fpow.ll
    llvm/test/CodeGen/AMDGPU/fract-match.ll
    llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
    llvm/test/CodeGen/AMDGPU/freeze.ll
    llvm/test/CodeGen/AMDGPU/frem.ll
    llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
    llvm/test/CodeGen/AMDGPU/function-args.ll
    llvm/test/CodeGen/AMDGPU/function-returns.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
    llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
    llvm/test/CodeGen/AMDGPU/half.ll
    llvm/test/CodeGen/AMDGPU/idot4s.ll
    llvm/test/CodeGen/AMDGPU/idot4u.ll
    llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
    llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
    llvm/test/CodeGen/AMDGPU/llvm.log.ll
    llvm/test/CodeGen/AMDGPU/llvm.log10.ll
    llvm/test/CodeGen/AMDGPU/llvm.log2.ll
    llvm/test/CodeGen/AMDGPU/llvm.powi.ll
    llvm/test/CodeGen/AMDGPU/llvm.round.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
    llvm/test/CodeGen/AMDGPU/lrint.ll
    llvm/test/CodeGen/AMDGPU/lround.ll
    llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
    llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
    llvm/test/CodeGen/AMDGPU/mad-mix.ll
    llvm/test/CodeGen/AMDGPU/maximumnum.ll
    llvm/test/CodeGen/AMDGPU/min.ll
    llvm/test/CodeGen/AMDGPU/minimumnum.ll
    llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
    llvm/test/CodeGen/AMDGPU/offset-split-global.ll
    llvm/test/CodeGen/AMDGPU/omod.ll
    llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
    llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
    llvm/test/CodeGen/AMDGPU/rotl.ll
    llvm/test/CodeGen/AMDGPU/rotr.ll
    llvm/test/CodeGen/AMDGPU/roundeven.ll
    llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
    llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
    llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
    llvm/test/CodeGen/AMDGPU/strict_fpext.ll
    llvm/test/CodeGen/AMDGPU/sub.ll
    llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
    llvm/test/CodeGen/AMDGPU/v_cndmask.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
    llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
    llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
    llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
    llvm/test/CodeGen/AMDGPU/vopc_dpp.mir
    llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
    llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 56edb29281944..2d19f9702e6ba 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v32i32_to_v32f32:
@@ -4447,693 +4448,1258 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i32_to_v128i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:88
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:84
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:80
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:76
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:72
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:68
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:64
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:60
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:12
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB6_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:  .LBB6_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB6_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v20, 3, v20
-; GFX11-NEXT:    v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT:    v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v32, 3, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v31, 3, v31
-; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT:    v_add_nc_u32_e32 v30, 3, v30
-; GFX11-NEXT:    v_add_nc_u32_e32 v29, 3, v29
-; GFX11-NEXT:    v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT:    v_add_nc_u32_e32 v28, 3, v28
-; GFX11-NEXT:    v_add_nc_u32_e32 v27, 3, v27
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT:    v_add_nc_u32_e32 v26, 3, v26
-; GFX11-NEXT:    v_add_nc_u32_e32 v25, 3, v25
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v24, 3, v24
-; GFX11-NEXT:    v_add_nc_u32_e32 v23, 3, v23
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT:    v_add_nc_u32_e32 v22, 3, v22
-; GFX11-NEXT:    v_add_nc_u32_e32 v21, 3, v21
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:  .LBB6_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v75
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v63
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v65
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v67, 0xff, v61
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v39, v55, v39
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v73
-; GFX11-NEXT:    v_or_b32_e32 v65, v67, v65
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v58
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v72
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v39
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v55, v55, v66
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v62
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v60
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v59
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v67
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v57
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v56
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v65
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v64
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v47
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v46
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v45
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v39
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v64
-; GFX11-NEXT:    v_or_b32_e32 v55, v65, v66
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v44
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v43
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v54, v65, v54
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v42
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v41
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v40
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v55
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_or_b32_e32 v54, v64, v65
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v182
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v183
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v181
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v53, v64, v53
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v55
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v65
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v39
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xff, v180
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v179
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v178
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v177
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v176
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v167
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v166
-; GFX11-NEXT:    v_or_b32_e32 v53, v53, v54
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v55
-; GFX11-NEXT:    v_or_b32_e32 v52, v64, v52
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v65
-; GFX11-NEXT:    v_or_b32_e32 v54, v66, v67
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    v_or_b32_e32 v1, v9, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v10, v53
-; GFX11-NEXT:    v_or_b32_e32 v3, v11, v52
-; GFX11-NEXT:    v_or_b32_e32 v4, v12, v54
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v165
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v164
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v163
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v162
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v161
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v160
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v151
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v150
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v149
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v148
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v147
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v146
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v39, v49
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v145
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v144
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v135
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v134
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v132
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v131
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v130
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v21
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v129
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v128
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v119
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v118
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v117
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v38, v39
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v116
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v115
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v37
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v24
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v114
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v113
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v103
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v102
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v101
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v100
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v99
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v98
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v97
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v35
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v36, v35
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v96
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v87
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v86
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v84
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v83
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v82
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v81
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v31
-; GFX11-NEXT:    v_lshlrev_b16 v28, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_lshlrev_b16 v30, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v70
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v69
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v68
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v26, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v27, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v28, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v29, v33, v34
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:88
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i32_to_v128i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB6_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB6_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB6_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v20, 3, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v31, 3, v31
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v30, 3, v30
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v29, 3, v29
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v28, 3, v28
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v27
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 3, v26
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v25, 3, v25
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v24, 3, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v23, 3, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v22, 3, v22
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v21, 3, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB6_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v1.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v55, v39
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v67
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v65
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v39, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v39, v54
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v53
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v53, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v12.h, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v12, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v116.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v15, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v20, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v22, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v26, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v65
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v53, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v38, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v35, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v25, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v29, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT:    s_clause 0x5
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB6_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:  .LBB6_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB6_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 3, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 3, v32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v31, 3, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v30, 3, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v29, 3, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v28, 3, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v27, 3, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v26, 3, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v25, 3, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v24, 3, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v23, 3, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v22, 3, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v21, 3, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB6_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v63
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v67, 0xff, v61
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, v55, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, v67, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v72
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v55, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v62
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v47
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v46
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v65, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v65, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v40
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v64, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v182
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v181
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v64, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v166
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v53, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v64, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v66, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v160
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v148
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v39, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v144
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v132
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v38, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v115
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v36, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v81
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v33, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    s_clause 0x5
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -9225,1038 +9791,1985 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v128i8_to_v32i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:592
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:588
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:584
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:580
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:576
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:572
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:568
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:564
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:560
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:556
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:552
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:548
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:544
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:540
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:536
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:532
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:528
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:524
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:520
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:516
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:512
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:508
-; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:504
-; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:500
-; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:496
-; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:492
-; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:488
-; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:484
-; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:480
-; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:476
-; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:472
-; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:468
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:464
-; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:460
-; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:456
-; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:452
-; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:448
-; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:444
-; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:440
-; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:436
-; GFX11-NEXT:    scratch_store_b32 off, v120, s32 offset:432
-; GFX11-NEXT:    scratch_store_b32 off, v121, s32 offset:428
-; GFX11-NEXT:    scratch_store_b32 off, v122, s32 offset:424
-; GFX11-NEXT:    scratch_store_b32 off, v123, s32 offset:420
-; GFX11-NEXT:    scratch_store_b32 off, v124, s32 offset:416
-; GFX11-NEXT:    scratch_store_b32 off, v125, s32 offset:412
-; GFX11-NEXT:    scratch_store_b32 off, v126, s32 offset:408
-; GFX11-NEXT:    scratch_store_b32 off, v127, s32 offset:404
-; GFX11-NEXT:    scratch_store_b32 off, v136, s32 offset:400
-; GFX11-NEXT:    scratch_store_b32 off, v137, s32 offset:396
-; GFX11-NEXT:    scratch_store_b32 off, v138, s32 offset:392
-; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-NEXT:    scratch_load_u16 v115, off, s32
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_u16 v136, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_u16 v137, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_u16 v138, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_u16 v132, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_u16 v134, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_u16 v150, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_u16 v160, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_u16 v161, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_u16 v167, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_u16 v176, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v177, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v41, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v42, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v124, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v125, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v126, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v127, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v111, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v120, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v121, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v122, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v123, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v106, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v107, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v108, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v109, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v110, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v164, 8, v138
-; GFX11-NEXT:    v_lshlrev_b16 v165, 8, v103
-; GFX11-NEXT:    v_lshlrev_b16 v166, 8, v102
-; GFX11-NEXT:    v_lshlrev_b16 v144, 8, v101
-; GFX11-NEXT:    v_lshlrev_b16 v145, 8, v100
-; GFX11-NEXT:    v_lshlrev_b16 v146, 8, v99
-; GFX11-NEXT:    v_lshlrev_b16 v147, 8, v31
-; GFX11-NEXT:    v_lshlrev_b16 v148, 8, v30
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v28
-; GFX11-NEXT:    v_lshlrev_b16 v128, 8, v26
-; GFX11-NEXT:    v_lshlrev_b16 v129, 8, v24
-; GFX11-NEXT:    v_lshlrev_b16 v130, 8, v22
-; GFX11-NEXT:    v_lshlrev_b16 v131, 8, v20
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v18
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v16
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v14
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v12
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v0
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB7_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v51
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v124
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v125
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v126
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v127
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v37
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v111
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v121
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v120
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v122
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v123
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v107
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v108
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v109
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v110
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v106
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v6, v12
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v92
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v78
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v77
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v76
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v75
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v60
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v59
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v93
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v94
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v95
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v104
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v105
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v79
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v88
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v89
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v90
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v91
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v58
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v44
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v43
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v42
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v41
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v40
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v178
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v177
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v176
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v167
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v61
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v62
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v63
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v72
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v73
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v45
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v46
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v47
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v56
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v57
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v161
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v160
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v151
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v150
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v149
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v135
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v134
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v133
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v132
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v113
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v179
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v180
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v181
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v182
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v183
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v162
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v163
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v164
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v165
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v166
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v98
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v96
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v84
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v83
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v82
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v144
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v145
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v146
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v147
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v148
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v119
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v128
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v129
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v130
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v131
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v66
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v64
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v114
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v115
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v116
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v117
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v118
-; GFX11-NEXT:    v_or_b32_e32 v32, v32, v99
-; GFX11-NEXT:    v_or_b32_e32 v33, v33, v100
-; GFX11-NEXT:    v_or_b32_e32 v34, v34, v101
-; GFX11-NEXT:    v_or_b32_e32 v35, v35, v102
-; GFX11-NEXT:    v_or_b32_e32 v36, v36, v103
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr92
-; GFX11-NEXT:    ; implicit-def: $vgpr78
-; GFX11-NEXT:    ; implicit-def: $vgpr77
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr124
-; GFX11-NEXT:    ; implicit-def: $vgpr125
-; GFX11-NEXT:    ; implicit-def: $vgpr126
-; GFX11-NEXT:    ; implicit-def: $vgpr127
-; GFX11-NEXT:    ; implicit-def: $vgpr111
-; GFX11-NEXT:    ; implicit-def: $vgpr120
-; GFX11-NEXT:    ; implicit-def: $vgpr121
-; GFX11-NEXT:    ; implicit-def: $vgpr122
-; GFX11-NEXT:    ; implicit-def: $vgpr123
-; GFX11-NEXT:    ; implicit-def: $vgpr106
-; GFX11-NEXT:    ; implicit-def: $vgpr107
-; GFX11-NEXT:    ; implicit-def: $vgpr108
-; GFX11-NEXT:    ; implicit-def: $vgpr109
-; GFX11-NEXT:    ; implicit-def: $vgpr110
-; GFX11-NEXT:    ; implicit-def: $vgpr93
-; GFX11-NEXT:    ; implicit-def: $vgpr94
-; GFX11-NEXT:    ; implicit-def: $vgpr95
-; GFX11-NEXT:    ; implicit-def: $vgpr104
-; GFX11-NEXT:    ; implicit-def: $vgpr105
-; GFX11-NEXT:    ; implicit-def: $vgpr79
-; GFX11-NEXT:    ; implicit-def: $vgpr88
-; GFX11-NEXT:    ; implicit-def: $vgpr89
-; GFX11-NEXT:    ; implicit-def: $vgpr90
-; GFX11-NEXT:    ; implicit-def: $vgpr91
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:  .LBB7_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB7_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v51, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v5, v50, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v124, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v125, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v126, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v127, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v7, v48, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_add_nc_u16 v8, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v36, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v10, v35, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v2, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v123, v2
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_or_b32_e32 v3, v111, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v120, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v121, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v6, v122, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v107, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v108, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v109, v10
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v38, 3
-; GFX11-NEXT:    v_or_b32_e32 v11, v110, v11
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v106, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_add_nc_u16 v7, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v32, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v9, v92, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v78, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v77, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v76, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v75, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v74, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v60, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v59, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v93, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v94, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v95, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v104, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v105, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v79, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v88, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v89, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v90, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v91, v16
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_add_nc_u16 v12, v58, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v44, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v43, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v42, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v41, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v40, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v178, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v177, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v176, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v167, 3
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v62, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v63, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v72, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v73, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v45, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v46, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v47, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v56, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v57, v21
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_add_nc_u16 v17, v161, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v160, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v151, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v150, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v149, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v135, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v134, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v133, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v132, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v113, 3
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v180, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v181, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v182, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v183, v21
-; GFX11-NEXT:    v_or_b32_e32 v22, v162, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v163, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v164, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v165, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v166, v26
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_add_nc_u16 v22, v112, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v98, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v97, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v96, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v87, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v84, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v83, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v82, 3
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v145, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v146, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v147, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v148, v26
-; GFX11-NEXT:    v_or_b32_e32 v27, v119, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v128, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v129, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v130, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v131, v31
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_add_nc_u16 v27, v81, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v32, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v33, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v34, v66, 3
-; GFX11-NEXT:    v_add_nc_u16 v35, v65, 3
-; GFX11-NEXT:    v_add_nc_u16 v36, v64, 3
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v114, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v115, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v116, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v117, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v118, v31
-; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-NEXT:    v_or_b32_e32 v33, v100, v33
-; GFX11-NEXT:    v_or_b32_e32 v34, v101, v34
-; GFX11-NEXT:    v_or_b32_e32 v35, v102, v35
-; GFX11-NEXT:    v_or_b32_e32 v36, v103, v36
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_add_nc_u16 v32, 0x300, v32
-; GFX11-NEXT:    v_add_nc_u16 v33, 0x300, v33
-; GFX11-NEXT:    v_add_nc_u16 v34, 0x300, v34
-; GFX11-NEXT:    v_add_nc_u16 v35, 0x300, v35
-; GFX11-NEXT:    v_add_nc_u16 v36, 0x300, v36
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:  .LBB7_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v138, off, s32 offset:392
-; GFX11-NEXT:    scratch_load_b32 v137, off, s32 offset:396
-; GFX11-NEXT:    scratch_load_b32 v136, off, s32 offset:400
-; GFX11-NEXT:    scratch_load_b32 v127, off, s32 offset:404
-; GFX11-NEXT:    scratch_load_b32 v126, off, s32 offset:408
-; GFX11-NEXT:    scratch_load_b32 v125, off, s32 offset:412
-; GFX11-NEXT:    scratch_load_b32 v124, off, s32 offset:416
-; GFX11-NEXT:    scratch_load_b32 v123, off, s32 offset:420
-; GFX11-NEXT:    scratch_load_b32 v122, off, s32 offset:424
-; GFX11-NEXT:    scratch_load_b32 v121, off, s32 offset:428
-; GFX11-NEXT:    scratch_load_b32 v120, off, s32 offset:432
-; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:436
-; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:440
-; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:444
-; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:448
-; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:452
-; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:456
-; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:460
-; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:464
-; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:468
-; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:472
-; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:476
-; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:480
-; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:484
-; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:488
-; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:492
-; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:496
-; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:500
-; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:504
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:508
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:512
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:516
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:520
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:524
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:528
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:532
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:536
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:540
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:544
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:548
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:552
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:556
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:560
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:564
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:568
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:572
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:576
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:580
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:584
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:588
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:592
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v103, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v49.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v165.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB7_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB7_4
+; GFX11-TRUE16-NEXT:  .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB7_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v148.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v150.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v151.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v145.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v135.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v146.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v131.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v132.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v147.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v133.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v130.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v133.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v134.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v114.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v128.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v130.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v113.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v129.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v116.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v117.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v112.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v113.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v85.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v100.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v100.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v69.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT:  .LBB7_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v148.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v148.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v145.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v144.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v135.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v146.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v147.h, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v132.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v131.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v119.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v132.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v133.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v130.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v118.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v133.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v134.h, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v134.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v128.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v128.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v114.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v129.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v130.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v116.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v129.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v116.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v102.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v103.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v117.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v97.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v103.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v97.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v112.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v113.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v99.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v99.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v100.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v101.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v87.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v52.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v82.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v82.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v68.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v68.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v55.h, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v64.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v65.h, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v53.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v54.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v53.h, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v49.h, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v50.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v50.h, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v51.l, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:592
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:516
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:468
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v120, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v121, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v122, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v123, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v124, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v125, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v126, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v127, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v136, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v137, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:184
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:188
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:180
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v160, off, s32 offset:156
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v161, off, s32 offset:148
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v167, off, s32 offset:140
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v176, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v177, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v144, 8, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v145, 8, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v146, 8, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v124
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v125
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v126
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v127
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v111
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v121
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v120
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v122
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v123
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v107
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v108
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v109
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v110
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v106
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v6, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v92
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v78
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v77
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v76
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v93
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v94
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v95
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v104
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v105
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v79
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v88
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v89
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v90
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v91
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v40
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v61
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v62
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v72
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v47
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v160
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v132
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v179
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v180
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v183
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v162
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v163
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v164
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v165
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v144
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v145
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v146
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v147
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v148
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v119
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v128
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v129
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v130
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v32, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v33, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v34, v101
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v35, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr124
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr125
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr126
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr127
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr111
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr120
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr121
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr122
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr123
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr106
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr107
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr108
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr109
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr110
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr93
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr95
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr105
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr91
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:  .LBB7_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB7_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v51, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v50, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v124, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v125, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v126, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v127, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v48, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v36, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v35, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v111, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v120, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v121, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v122, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v107, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v108, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v109, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v38, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v110, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v106, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v32, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v92, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v78, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v77, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v76, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v75, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v74, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v60, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v59, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v93, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v94, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v95, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v104, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v105, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v79, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v88, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v89, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v90, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v91, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v58, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v44, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v43, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v42, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v41, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v40, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v178, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v177, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v176, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v62, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v63, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v72, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v73, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v45, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v46, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v47, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v56, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v57, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v160, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v151, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v150, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v149, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v135, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v134, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v133, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v132, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v180, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v181, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v182, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v183, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v162, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v163, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v164, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v165, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v112, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v98, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v97, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v96, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v87, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v84, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v83, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v145, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v146, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v147, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v148, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v119, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v128, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v129, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v130, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v81, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, v66, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, v65, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, v64, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v114, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v115, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v116, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v117, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v118, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v100, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v101, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v102, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, 0x300, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, 0x300, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, 0x300, v34
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, 0x300, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, 0x300, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:  .LBB7_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v138, off, s32 offset:392
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v137, off, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v136, off, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v127, off, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v126, off, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v125, off, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v124, off, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v123, off, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v122, off, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v121, off, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v120, off, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:516
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -12941,550 +14454,1113 @@ define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64bf16_to_v32i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v32
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB9_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v32, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v39, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v36, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v34, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v35, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v35, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
-; GFX11-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_bfe_u32 v34, v4, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v33, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v4
-; GFX11-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v35, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_bfe_u32 v33, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v34, v0, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_bfe_u32 v33, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v0
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v31, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v31
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v29
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT:    v_bfe_u32 v34, v28, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v28
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v27
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_bfe_u32 v35, v26, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v26
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v25
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v34, v24, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v24
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT:    v_bfe_u32 v35, v22, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v22
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v36, v34, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v21, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v21
-; GFX11-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
-; GFX11-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_bfe_u32 v38, v33, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v34, v20, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
-; GFX11-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_bfe_u32 v48, v18, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX11-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v35
-; GFX11-NEXT:    v_bfe_u32 v39, v17, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_bfe_u32 v48, v36, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v36
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v16
-; GFX11-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_bfe_u32 v37, v16, 16, 1
-; GFX11-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
-; GFX11-NEXT:  .LBB9_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB9_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v39, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v32.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v32, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v48, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v38, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v32, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v33
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v32
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v34
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v31.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v31, 0xffff, v31, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v30.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v30, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v28
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v29.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v29, 0xffff, v29, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v28.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v28, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v27.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v27, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v26, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v25.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v25, 0xffff, v25, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v24, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v23
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v22
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v23, 0xffff, v23, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v22, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v21, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v33, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v36, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v39, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v20, v32
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v19, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v50, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v18, v34
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v36.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v17, 0xffff, v17, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v36, v16
+; GFX11-TRUE16-NEXT:  .LBB9_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB9_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v33, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v36, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB9_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -20801,676 +22877,1224 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32f32_to_v128i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:88
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:84
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:80
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:76
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:72
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:68
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:64
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:60
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:12
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB18_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:  .LBB18_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB18_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19
-; GFX11-NEXT:    v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31
-; GFX11-NEXT:    v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23
-; GFX11-NEXT:    v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT:    v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29
-; GFX11-NEXT:    v_dual_add_f32 v30, 1.0, v30 :: v_dual_add_f32 v25, 1.0, v25
-; GFX11-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13
-; GFX11-NEXT:    v_dual_add_f32 v28, 1.0, v28 :: v_dual_add_f32 v27, 1.0, v27
-; GFX11-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
-; GFX11-NEXT:    v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11
-; GFX11-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3
-; GFX11-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v9, 1.0, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1
-; GFX11-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v7, 1.0, v7
-; GFX11-NEXT:    v_dual_add_f32 v22, 1.0, v22 :: v_dual_add_f32 v21, 1.0, v21
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:  .LBB18_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v75
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v63
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v65
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v67, 0xff, v61
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v39, v55, v39
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v73
-; GFX11-NEXT:    v_or_b32_e32 v65, v67, v65
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v58
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v72
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v39
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v55, v55, v66
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v62
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v60
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v59
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v67
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v57
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v56
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v65
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v64
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v47
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v46
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v45
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v39
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v64
-; GFX11-NEXT:    v_or_b32_e32 v55, v65, v66
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v44
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v43
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v54, v65, v54
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v42
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v41
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v40
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v55
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_or_b32_e32 v54, v64, v65
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v182
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v183
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v181
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v53, v64, v53
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v55
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v65
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v39
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xff, v180
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v179
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v178
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v177
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v176
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v167
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v166
-; GFX11-NEXT:    v_or_b32_e32 v53, v53, v54
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v55
-; GFX11-NEXT:    v_or_b32_e32 v52, v64, v52
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v65
-; GFX11-NEXT:    v_or_b32_e32 v54, v66, v67
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    v_or_b32_e32 v1, v9, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v10, v53
-; GFX11-NEXT:    v_or_b32_e32 v3, v11, v52
-; GFX11-NEXT:    v_or_b32_e32 v4, v12, v54
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v165
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v164
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v163
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v162
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v161
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v160
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v151
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v150
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v149
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v148
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v147
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v146
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v39, v49
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v145
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v144
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v135
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v134
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v132
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v131
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v130
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v21
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v129
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v128
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v119
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v118
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v117
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v38, v39
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v116
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v115
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v37
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v24
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v114
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v113
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v103
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v102
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v101
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v100
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v99
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v98
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v97
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v35
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v36, v35
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v96
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v87
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v86
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v84
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v83
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v82
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v81
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v31
-; GFX11-NEXT:    v_lshlrev_b16 v28, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_lshlrev_b16 v30, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v70
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v69
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v68
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v26, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v27, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v28, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v29, v33, v34
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:88
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32f32_to_v128i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB18_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB18_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB18_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v30, 1.0, v30 :: v_dual_add_f32 v25, 1.0, v25
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v28, 1.0, v28 :: v_dual_add_f32 v27, 1.0, v27
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v9, 1.0, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v7, 1.0, v7
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v22, 1.0, v22 :: v_dual_add_f32 v21, 1.0, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB18_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v1.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v55, v39
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v67
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v65
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v39, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v39, v54
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v53
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v53, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v12.h, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v12, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v116.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v15, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v20, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v22, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v26, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v65
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v53, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v38, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v35, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v25, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v29, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT:    s_clause 0x5
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB18_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:  .LBB18_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB18_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v30, 1.0, v30 :: v_dual_add_f32 v25, 1.0, v25
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v28, 1.0, v28 :: v_dual_add_f32 v27, 1.0, v27
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v9, 1.0, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v7, 1.0, v7
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v22, 1.0, v22 :: v_dual_add_f32 v21, 1.0, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB18_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v63
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v67, 0xff, v61
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, v55, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, v67, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v72
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v55, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v62
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v47
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v46
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v65, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v65, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v40
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v64, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v182
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v181
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v64, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v166
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v53, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v64, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v66, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v160
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v148
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v39, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v144
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v132
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v38, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v115
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v36, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v81
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v33, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    s_clause 0x5
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -25562,1038 +28186,1985 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v128i8_to_v32f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:592
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:588
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:584
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:580
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:576
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:572
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:568
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:564
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:560
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:556
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:552
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:548
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:544
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:540
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:536
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:532
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:528
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:524
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:520
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:516
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:512
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:508
-; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:504
-; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:500
-; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:496
-; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:492
-; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:488
-; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:484
-; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:480
-; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:476
-; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:472
-; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:468
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:464
-; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:460
-; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:456
-; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:452
-; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:448
-; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:444
-; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:440
-; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:436
-; GFX11-NEXT:    scratch_store_b32 off, v120, s32 offset:432
-; GFX11-NEXT:    scratch_store_b32 off, v121, s32 offset:428
-; GFX11-NEXT:    scratch_store_b32 off, v122, s32 offset:424
-; GFX11-NEXT:    scratch_store_b32 off, v123, s32 offset:420
-; GFX11-NEXT:    scratch_store_b32 off, v124, s32 offset:416
-; GFX11-NEXT:    scratch_store_b32 off, v125, s32 offset:412
-; GFX11-NEXT:    scratch_store_b32 off, v126, s32 offset:408
-; GFX11-NEXT:    scratch_store_b32 off, v127, s32 offset:404
-; GFX11-NEXT:    scratch_store_b32 off, v136, s32 offset:400
-; GFX11-NEXT:    scratch_store_b32 off, v137, s32 offset:396
-; GFX11-NEXT:    scratch_store_b32 off, v138, s32 offset:392
-; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-NEXT:    scratch_load_u16 v115, off, s32
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_u16 v136, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_u16 v137, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_u16 v138, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_u16 v132, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_u16 v134, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_u16 v150, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_u16 v160, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_u16 v161, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_u16 v167, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_u16 v176, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v177, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v41, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v42, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v124, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v125, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v126, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v127, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v111, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v120, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v121, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v122, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v123, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v106, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v107, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v108, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v109, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v110, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v164, 8, v138
-; GFX11-NEXT:    v_lshlrev_b16 v165, 8, v103
-; GFX11-NEXT:    v_lshlrev_b16 v166, 8, v102
-; GFX11-NEXT:    v_lshlrev_b16 v144, 8, v101
-; GFX11-NEXT:    v_lshlrev_b16 v145, 8, v100
-; GFX11-NEXT:    v_lshlrev_b16 v146, 8, v99
-; GFX11-NEXT:    v_lshlrev_b16 v147, 8, v31
-; GFX11-NEXT:    v_lshlrev_b16 v148, 8, v30
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v28
-; GFX11-NEXT:    v_lshlrev_b16 v128, 8, v26
-; GFX11-NEXT:    v_lshlrev_b16 v129, 8, v24
-; GFX11-NEXT:    v_lshlrev_b16 v130, 8, v22
-; GFX11-NEXT:    v_lshlrev_b16 v131, 8, v20
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v18
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v16
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v14
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v12
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v0
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB19_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v51
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v124
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v125
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v126
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v127
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v37
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v111
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v121
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v120
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v122
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v123
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v107
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v108
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v109
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v110
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v106
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v6, v12
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v92
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v78
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v77
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v76
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v75
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v60
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v59
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v93
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v94
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v95
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v104
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v105
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v79
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v88
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v89
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v90
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v91
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v58
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v44
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v43
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v42
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v41
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v40
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v178
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v177
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v176
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v167
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v61
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v62
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v63
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v72
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v73
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v45
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v46
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v47
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v56
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v57
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v161
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v160
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v151
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v150
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v149
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v135
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v134
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v133
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v132
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v113
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v179
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v180
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v181
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v182
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v183
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v162
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v163
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v164
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v165
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v166
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v98
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v96
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v84
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v83
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v82
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v144
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v145
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v146
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v147
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v148
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v119
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v128
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v129
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v130
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v131
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v66
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v64
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v114
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v115
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v116
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v117
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v118
-; GFX11-NEXT:    v_or_b32_e32 v32, v32, v99
-; GFX11-NEXT:    v_or_b32_e32 v33, v33, v100
-; GFX11-NEXT:    v_or_b32_e32 v34, v34, v101
-; GFX11-NEXT:    v_or_b32_e32 v35, v35, v102
-; GFX11-NEXT:    v_or_b32_e32 v36, v36, v103
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr92
-; GFX11-NEXT:    ; implicit-def: $vgpr78
-; GFX11-NEXT:    ; implicit-def: $vgpr77
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr124
-; GFX11-NEXT:    ; implicit-def: $vgpr125
-; GFX11-NEXT:    ; implicit-def: $vgpr126
-; GFX11-NEXT:    ; implicit-def: $vgpr127
-; GFX11-NEXT:    ; implicit-def: $vgpr111
-; GFX11-NEXT:    ; implicit-def: $vgpr120
-; GFX11-NEXT:    ; implicit-def: $vgpr121
-; GFX11-NEXT:    ; implicit-def: $vgpr122
-; GFX11-NEXT:    ; implicit-def: $vgpr123
-; GFX11-NEXT:    ; implicit-def: $vgpr106
-; GFX11-NEXT:    ; implicit-def: $vgpr107
-; GFX11-NEXT:    ; implicit-def: $vgpr108
-; GFX11-NEXT:    ; implicit-def: $vgpr109
-; GFX11-NEXT:    ; implicit-def: $vgpr110
-; GFX11-NEXT:    ; implicit-def: $vgpr93
-; GFX11-NEXT:    ; implicit-def: $vgpr94
-; GFX11-NEXT:    ; implicit-def: $vgpr95
-; GFX11-NEXT:    ; implicit-def: $vgpr104
-; GFX11-NEXT:    ; implicit-def: $vgpr105
-; GFX11-NEXT:    ; implicit-def: $vgpr79
-; GFX11-NEXT:    ; implicit-def: $vgpr88
-; GFX11-NEXT:    ; implicit-def: $vgpr89
-; GFX11-NEXT:    ; implicit-def: $vgpr90
-; GFX11-NEXT:    ; implicit-def: $vgpr91
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:  .LBB19_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB19_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v51, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v5, v50, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v124, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v125, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v126, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v127, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v7, v48, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_add_nc_u16 v8, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v36, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v10, v35, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v2, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v123, v2
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_or_b32_e32 v3, v111, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v120, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v121, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v6, v122, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v107, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v108, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v109, v10
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v38, 3
-; GFX11-NEXT:    v_or_b32_e32 v11, v110, v11
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v106, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_add_nc_u16 v7, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v32, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v9, v92, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v78, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v77, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v76, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v75, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v74, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v60, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v59, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v93, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v94, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v95, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v104, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v105, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v79, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v88, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v89, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v90, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v91, v16
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_add_nc_u16 v12, v58, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v44, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v43, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v42, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v41, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v40, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v178, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v177, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v176, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v167, 3
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v62, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v63, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v72, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v73, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v45, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v46, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v47, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v56, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v57, v21
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_add_nc_u16 v17, v161, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v160, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v151, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v150, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v149, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v135, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v134, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v133, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v132, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v113, 3
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v180, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v181, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v182, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v183, v21
-; GFX11-NEXT:    v_or_b32_e32 v22, v162, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v163, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v164, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v165, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v166, v26
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_add_nc_u16 v22, v112, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v98, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v97, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v96, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v87, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v84, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v83, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v82, 3
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v145, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v146, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v147, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v148, v26
-; GFX11-NEXT:    v_or_b32_e32 v27, v119, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v128, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v129, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v130, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v131, v31
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_add_nc_u16 v27, v81, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v32, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v33, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v34, v66, 3
-; GFX11-NEXT:    v_add_nc_u16 v35, v65, 3
-; GFX11-NEXT:    v_add_nc_u16 v36, v64, 3
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v114, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v115, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v116, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v117, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v118, v31
-; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-NEXT:    v_or_b32_e32 v33, v100, v33
-; GFX11-NEXT:    v_or_b32_e32 v34, v101, v34
-; GFX11-NEXT:    v_or_b32_e32 v35, v102, v35
-; GFX11-NEXT:    v_or_b32_e32 v36, v103, v36
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_add_nc_u16 v32, 0x300, v32
-; GFX11-NEXT:    v_add_nc_u16 v33, 0x300, v33
-; GFX11-NEXT:    v_add_nc_u16 v34, 0x300, v34
-; GFX11-NEXT:    v_add_nc_u16 v35, 0x300, v35
-; GFX11-NEXT:    v_add_nc_u16 v36, 0x300, v36
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:  .LBB19_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v138, off, s32 offset:392
-; GFX11-NEXT:    scratch_load_b32 v137, off, s32 offset:396
-; GFX11-NEXT:    scratch_load_b32 v136, off, s32 offset:400
-; GFX11-NEXT:    scratch_load_b32 v127, off, s32 offset:404
-; GFX11-NEXT:    scratch_load_b32 v126, off, s32 offset:408
-; GFX11-NEXT:    scratch_load_b32 v125, off, s32 offset:412
-; GFX11-NEXT:    scratch_load_b32 v124, off, s32 offset:416
-; GFX11-NEXT:    scratch_load_b32 v123, off, s32 offset:420
-; GFX11-NEXT:    scratch_load_b32 v122, off, s32 offset:424
-; GFX11-NEXT:    scratch_load_b32 v121, off, s32 offset:428
-; GFX11-NEXT:    scratch_load_b32 v120, off, s32 offset:432
-; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:436
-; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:440
-; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:444
-; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:448
-; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:452
-; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:456
-; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:460
-; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:464
-; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:468
-; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:472
-; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:476
-; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:480
-; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:484
-; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:488
-; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:492
-; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:496
-; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:500
-; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:504
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:508
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:512
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:516
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:520
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:524
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:528
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:532
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:536
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:540
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:544
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:548
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:552
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:556
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:560
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:564
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:568
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:572
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:576
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:580
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:584
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:588
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:592
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v32f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v103, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v49.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v165.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX11-TRUE16-NEXT:  .LBB19_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB19_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v148.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v150.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v151.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v145.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v135.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v146.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v131.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v132.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v147.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v133.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v130.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v133.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v134.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v114.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v128.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v130.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v113.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v129.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v116.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v117.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v112.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v113.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v85.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v100.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v100.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v69.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB19_2
+; GFX11-TRUE16-NEXT:  .LBB19_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v148.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v148.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v145.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v144.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v135.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v146.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v147.h, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v132.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v131.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v119.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v132.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v133.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v130.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v118.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v133.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v134.h, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v134.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v128.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v128.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v114.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v129.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v130.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v116.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v129.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v116.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v102.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v103.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v117.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v97.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v103.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v97.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v112.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v113.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v99.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v99.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v100.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v101.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v87.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v52.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v82.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v82.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v68.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v68.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v55.h, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v64.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v65.h, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v53.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v54.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v53.h, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v49.h, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v50.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v50.h, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v51.l, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v32f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:592
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:516
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:468
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v120, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v121, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v122, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v123, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v124, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v125, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v126, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v127, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v136, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v137, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:184
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:188
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:180
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v160, off, s32 offset:156
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v161, off, s32 offset:148
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v167, off, s32 offset:140
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v176, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v177, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v144, 8, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v145, 8, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v146, 8, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB19_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v124
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v125
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v126
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v127
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v111
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v121
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v120
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v122
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v123
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v107
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v108
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v109
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v110
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v106
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v6, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v92
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v78
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v77
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v76
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v93
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v94
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v95
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v104
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v105
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v79
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v88
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v89
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v90
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v91
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v40
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v61
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v62
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v72
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v47
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v160
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v132
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v179
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v180
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v183
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v162
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v163
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v164
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v165
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v144
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v145
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v146
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v147
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v148
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v119
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v128
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v129
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v130
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v32, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v33, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v34, v101
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v35, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr124
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr125
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr126
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr127
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr111
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr120
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr121
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr122
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr123
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr106
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr107
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr108
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr109
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr110
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr93
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr95
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr105
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr91
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:  .LBB19_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB19_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v51, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v50, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v124, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v125, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v126, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v127, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v48, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v36, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v35, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v111, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v120, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v121, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v122, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v107, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v108, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v109, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v38, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v110, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v106, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v32, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v92, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v78, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v77, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v76, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v75, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v74, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v60, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v59, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v93, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v94, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v95, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v104, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v105, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v79, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v88, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v89, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v90, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v91, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v58, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v44, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v43, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v42, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v41, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v40, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v178, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v177, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v176, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v62, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v63, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v72, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v73, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v45, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v46, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v47, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v56, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v57, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v160, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v151, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v150, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v149, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v135, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v134, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v133, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v132, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v180, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v181, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v182, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v183, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v162, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v163, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v164, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v165, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v112, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v98, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v97, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v96, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v87, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v84, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v83, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v145, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v146, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v147, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v148, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v119, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v128, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v129, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v130, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v81, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, v66, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, v65, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, v64, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v114, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v115, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v116, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v117, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v118, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v100, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v101, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v102, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, 0x300, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, 0x300, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, 0x300, v34
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, 0x300, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, 0x300, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:  .LBB19_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v138, off, s32 offset:392
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v137, off, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v136, off, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v127, off, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v126, off, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v125, off, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v124, off, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v123, off, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v122, off, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v121, off, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v120, off, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:516
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -29262,550 +32833,1113 @@ define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64bf16_to_v32f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v32
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB21_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v32, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v39, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v36, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v34, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v35, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v35, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
-; GFX11-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_bfe_u32 v34, v4, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v33, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v4
-; GFX11-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v35, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_bfe_u32 v33, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v34, v0, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_bfe_u32 v33, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v0
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v31, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v31
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v29
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT:    v_bfe_u32 v34, v28, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v28
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v27
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_bfe_u32 v35, v26, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v26
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v25
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v34, v24, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v24
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT:    v_bfe_u32 v35, v22, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v22
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v36, v34, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v21, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v21
-; GFX11-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
-; GFX11-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_bfe_u32 v38, v33, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v34, v20, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
-; GFX11-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_bfe_u32 v48, v18, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX11-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v35
-; GFX11-NEXT:    v_bfe_u32 v39, v17, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_bfe_u32 v48, v36, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v36
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v16
-; GFX11-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_bfe_u32 v37, v16, 16, 1
-; GFX11-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
-; GFX11-NEXT:  .LBB21_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v32f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v39, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v32.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v32, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v48, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v38, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v32, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v33
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v32
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v34
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v31.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v31, 0xffff, v31, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v30.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v30, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v28
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v29.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v29, 0xffff, v29, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v28.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v28, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v27.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v27, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v26, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v25.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v25, 0xffff, v25, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v24, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v23
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v22
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v23, 0xffff, v23, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v22, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v21, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v33, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v36, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v39, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v20, v32
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v19, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v50, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v18, v34
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v36.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v17, 0xffff, v17, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v36, v16
+; GFX11-TRUE16-NEXT:  .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v32f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v33, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v36, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -36682,701 +40816,1274 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i64_to_v128i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:88
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:84
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:80
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:76
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:72
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:68
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:64
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:60
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:12
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB28_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:  .LBB28_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB28_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v11, vcc_lo, v11, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v13, vcc_lo, v13, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v15, vcc_lo, v15, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v17, vcc_lo, v17, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v18, null, 0, v18, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v19, vcc_lo, v19, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v20, null, 0, v20, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v21, vcc_lo, v21, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v25, vcc_lo, v25, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v27, vcc_lo, v27, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v29, vcc_lo, v29, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v31, vcc_lo, v31, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v23, vcc_lo, v23, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:  .LBB28_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v75
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v63
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v65
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v67, 0xff, v61
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v39, v55, v39
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v73
-; GFX11-NEXT:    v_or_b32_e32 v65, v67, v65
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v58
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v72
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v39
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v55, v55, v66
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v62
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v60
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v59
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v67
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v57
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v56
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v65
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v64
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v47
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v46
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v45
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v39
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v64
-; GFX11-NEXT:    v_or_b32_e32 v55, v65, v66
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v44
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v43
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v54, v65, v54
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v42
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v41
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v40
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v55
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_or_b32_e32 v54, v64, v65
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v182
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v183
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v181
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v53, v64, v53
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v55
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v65
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v39
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xff, v180
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v179
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v178
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v177
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v176
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v167
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v166
-; GFX11-NEXT:    v_or_b32_e32 v53, v53, v54
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v55
-; GFX11-NEXT:    v_or_b32_e32 v52, v64, v52
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v65
-; GFX11-NEXT:    v_or_b32_e32 v54, v66, v67
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    v_or_b32_e32 v1, v9, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v10, v53
-; GFX11-NEXT:    v_or_b32_e32 v3, v11, v52
-; GFX11-NEXT:    v_or_b32_e32 v4, v12, v54
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v165
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v164
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v163
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v162
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v161
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v160
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v151
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v150
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v149
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v148
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v147
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v146
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v39, v49
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v145
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v144
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v135
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v134
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v132
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v131
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v130
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v21
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v129
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v128
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v119
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v118
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v117
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v38, v39
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v116
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v115
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v37
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v24
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v114
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v113
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v103
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v102
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v101
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v100
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v99
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v98
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v97
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v35
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v36, v35
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v96
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v87
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v86
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v84
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v83
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v82
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v81
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v31
-; GFX11-NEXT:    v_lshlrev_b16 v28, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_lshlrev_b16 v30, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v70
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v69
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v68
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v26, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v27, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v28, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v29, v33, v34
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:88
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i64_to_v128i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB28_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB28_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB28_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v11, vcc_lo, v11, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v13, vcc_lo, v13, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v15, vcc_lo, v15, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v17, vcc_lo, v17, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v18, null, 0, v18, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v19, vcc_lo, v19, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v20, null, 0, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v21, vcc_lo, v21, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v25, vcc_lo, v25, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v27, vcc_lo, v27, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v29, vcc_lo, v29, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v31, vcc_lo, v31, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v23, vcc_lo, v23, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB28_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v1.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v55, v39
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v67
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v65
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v39, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v39, v54
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v53
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v53, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v12.h, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v12, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v116.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v15, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v20, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v22, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v26, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v65
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v53, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v38, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v35, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v25, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v29, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT:    s_clause 0x5
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:  .LBB28_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB28_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v11, vcc_lo, v11, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v13, vcc_lo, v13, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v15, vcc_lo, v15, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v17, vcc_lo, v17, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v18, null, 0, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v19, vcc_lo, v19, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v20, null, 0, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v21, vcc_lo, v21, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v25, vcc_lo, v25, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v27, vcc_lo, v27, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v29, vcc_lo, v29, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v31, vcc_lo, v31, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v23, vcc_lo, v23, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB28_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v63
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v67, 0xff, v61
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, v55, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, v67, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v72
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v55, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v62
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v47
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v46
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v65, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v65, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v40
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v64, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v182
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v181
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v64, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v166
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v53, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v64, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v66, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v160
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v148
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v39, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v144
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v132
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v38, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v115
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v36, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v81
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v33, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    s_clause 0x5
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -41468,1038 +46175,1985 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v128i8_to_v16i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:592
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:588
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:584
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:580
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:576
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:572
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:568
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:564
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:560
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:556
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:552
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:548
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:544
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:540
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:536
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:532
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:528
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:524
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:520
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:516
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:512
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:508
-; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:504
-; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:500
-; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:496
-; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:492
-; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:488
-; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:484
-; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:480
-; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:476
-; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:472
-; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:468
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:464
-; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:460
-; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:456
-; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:452
-; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:448
-; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:444
-; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:440
-; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:436
-; GFX11-NEXT:    scratch_store_b32 off, v120, s32 offset:432
-; GFX11-NEXT:    scratch_store_b32 off, v121, s32 offset:428
-; GFX11-NEXT:    scratch_store_b32 off, v122, s32 offset:424
-; GFX11-NEXT:    scratch_store_b32 off, v123, s32 offset:420
-; GFX11-NEXT:    scratch_store_b32 off, v124, s32 offset:416
-; GFX11-NEXT:    scratch_store_b32 off, v125, s32 offset:412
-; GFX11-NEXT:    scratch_store_b32 off, v126, s32 offset:408
-; GFX11-NEXT:    scratch_store_b32 off, v127, s32 offset:404
-; GFX11-NEXT:    scratch_store_b32 off, v136, s32 offset:400
-; GFX11-NEXT:    scratch_store_b32 off, v137, s32 offset:396
-; GFX11-NEXT:    scratch_store_b32 off, v138, s32 offset:392
-; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-NEXT:    scratch_load_u16 v115, off, s32
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_u16 v136, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_u16 v137, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_u16 v138, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_u16 v132, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_u16 v134, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_u16 v150, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_u16 v160, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_u16 v161, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_u16 v167, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_u16 v176, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v177, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v41, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v42, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v124, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v125, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v126, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v127, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v111, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v120, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v121, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v122, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v123, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v106, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v107, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v108, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v109, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v110, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v164, 8, v138
-; GFX11-NEXT:    v_lshlrev_b16 v165, 8, v103
-; GFX11-NEXT:    v_lshlrev_b16 v166, 8, v102
-; GFX11-NEXT:    v_lshlrev_b16 v144, 8, v101
-; GFX11-NEXT:    v_lshlrev_b16 v145, 8, v100
-; GFX11-NEXT:    v_lshlrev_b16 v146, 8, v99
-; GFX11-NEXT:    v_lshlrev_b16 v147, 8, v31
-; GFX11-NEXT:    v_lshlrev_b16 v148, 8, v30
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v28
-; GFX11-NEXT:    v_lshlrev_b16 v128, 8, v26
-; GFX11-NEXT:    v_lshlrev_b16 v129, 8, v24
-; GFX11-NEXT:    v_lshlrev_b16 v130, 8, v22
-; GFX11-NEXT:    v_lshlrev_b16 v131, 8, v20
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v18
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v16
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v14
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v12
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v0
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB29_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v51
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v124
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v125
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v126
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v127
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v37
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v111
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v121
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v120
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v122
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v123
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v107
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v108
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v109
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v110
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v106
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v6, v12
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v92
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v78
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v77
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v76
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v75
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v60
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v59
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v93
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v94
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v95
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v104
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v105
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v79
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v88
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v89
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v90
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v91
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v58
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v44
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v43
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v42
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v41
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v40
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v178
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v177
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v176
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v167
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v61
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v62
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v63
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v72
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v73
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v45
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v46
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v47
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v56
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v57
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v161
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v160
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v151
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v150
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v149
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v135
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v134
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v133
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v132
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v113
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v179
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v180
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v181
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v182
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v183
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v162
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v163
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v164
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v165
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v166
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v98
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v96
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v84
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v83
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v82
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v144
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v145
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v146
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v147
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v148
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v119
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v128
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v129
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v130
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v131
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v66
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v64
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v114
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v115
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v116
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v117
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v118
-; GFX11-NEXT:    v_or_b32_e32 v32, v32, v99
-; GFX11-NEXT:    v_or_b32_e32 v33, v33, v100
-; GFX11-NEXT:    v_or_b32_e32 v34, v34, v101
-; GFX11-NEXT:    v_or_b32_e32 v35, v35, v102
-; GFX11-NEXT:    v_or_b32_e32 v36, v36, v103
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr92
-; GFX11-NEXT:    ; implicit-def: $vgpr78
-; GFX11-NEXT:    ; implicit-def: $vgpr77
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr124
-; GFX11-NEXT:    ; implicit-def: $vgpr125
-; GFX11-NEXT:    ; implicit-def: $vgpr126
-; GFX11-NEXT:    ; implicit-def: $vgpr127
-; GFX11-NEXT:    ; implicit-def: $vgpr111
-; GFX11-NEXT:    ; implicit-def: $vgpr120
-; GFX11-NEXT:    ; implicit-def: $vgpr121
-; GFX11-NEXT:    ; implicit-def: $vgpr122
-; GFX11-NEXT:    ; implicit-def: $vgpr123
-; GFX11-NEXT:    ; implicit-def: $vgpr106
-; GFX11-NEXT:    ; implicit-def: $vgpr107
-; GFX11-NEXT:    ; implicit-def: $vgpr108
-; GFX11-NEXT:    ; implicit-def: $vgpr109
-; GFX11-NEXT:    ; implicit-def: $vgpr110
-; GFX11-NEXT:    ; implicit-def: $vgpr93
-; GFX11-NEXT:    ; implicit-def: $vgpr94
-; GFX11-NEXT:    ; implicit-def: $vgpr95
-; GFX11-NEXT:    ; implicit-def: $vgpr104
-; GFX11-NEXT:    ; implicit-def: $vgpr105
-; GFX11-NEXT:    ; implicit-def: $vgpr79
-; GFX11-NEXT:    ; implicit-def: $vgpr88
-; GFX11-NEXT:    ; implicit-def: $vgpr89
-; GFX11-NEXT:    ; implicit-def: $vgpr90
-; GFX11-NEXT:    ; implicit-def: $vgpr91
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:  .LBB29_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB29_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v51, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v5, v50, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v124, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v125, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v126, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v127, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v7, v48, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_add_nc_u16 v8, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v36, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v10, v35, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v2, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v123, v2
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_or_b32_e32 v3, v111, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v120, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v121, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v6, v122, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v107, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v108, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v109, v10
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v38, 3
-; GFX11-NEXT:    v_or_b32_e32 v11, v110, v11
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v106, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_add_nc_u16 v7, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v32, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v9, v92, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v78, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v77, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v76, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v75, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v74, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v60, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v59, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v93, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v94, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v95, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v104, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v105, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v79, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v88, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v89, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v90, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v91, v16
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_add_nc_u16 v12, v58, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v44, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v43, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v42, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v41, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v40, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v178, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v177, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v176, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v167, 3
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v62, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v63, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v72, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v73, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v45, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v46, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v47, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v56, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v57, v21
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_add_nc_u16 v17, v161, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v160, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v151, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v150, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v149, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v135, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v134, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v133, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v132, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v113, 3
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v180, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v181, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v182, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v183, v21
-; GFX11-NEXT:    v_or_b32_e32 v22, v162, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v163, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v164, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v165, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v166, v26
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_add_nc_u16 v22, v112, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v98, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v97, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v96, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v87, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v84, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v83, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v82, 3
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v145, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v146, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v147, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v148, v26
-; GFX11-NEXT:    v_or_b32_e32 v27, v119, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v128, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v129, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v130, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v131, v31
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_add_nc_u16 v27, v81, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v32, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v33, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v34, v66, 3
-; GFX11-NEXT:    v_add_nc_u16 v35, v65, 3
-; GFX11-NEXT:    v_add_nc_u16 v36, v64, 3
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v114, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v115, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v116, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v117, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v118, v31
-; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-NEXT:    v_or_b32_e32 v33, v100, v33
-; GFX11-NEXT:    v_or_b32_e32 v34, v101, v34
-; GFX11-NEXT:    v_or_b32_e32 v35, v102, v35
-; GFX11-NEXT:    v_or_b32_e32 v36, v103, v36
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_add_nc_u16 v32, 0x300, v32
-; GFX11-NEXT:    v_add_nc_u16 v33, 0x300, v33
-; GFX11-NEXT:    v_add_nc_u16 v34, 0x300, v34
-; GFX11-NEXT:    v_add_nc_u16 v35, 0x300, v35
-; GFX11-NEXT:    v_add_nc_u16 v36, 0x300, v36
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:  .LBB29_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v138, off, s32 offset:392
-; GFX11-NEXT:    scratch_load_b32 v137, off, s32 offset:396
-; GFX11-NEXT:    scratch_load_b32 v136, off, s32 offset:400
-; GFX11-NEXT:    scratch_load_b32 v127, off, s32 offset:404
-; GFX11-NEXT:    scratch_load_b32 v126, off, s32 offset:408
-; GFX11-NEXT:    scratch_load_b32 v125, off, s32 offset:412
-; GFX11-NEXT:    scratch_load_b32 v124, off, s32 offset:416
-; GFX11-NEXT:    scratch_load_b32 v123, off, s32 offset:420
-; GFX11-NEXT:    scratch_load_b32 v122, off, s32 offset:424
-; GFX11-NEXT:    scratch_load_b32 v121, off, s32 offset:428
-; GFX11-NEXT:    scratch_load_b32 v120, off, s32 offset:432
-; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:436
-; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:440
-; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:444
-; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:448
-; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:452
-; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:456
-; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:460
-; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:464
-; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:468
-; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:472
-; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:476
-; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:480
-; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:484
-; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:488
-; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:492
-; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:496
-; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:500
-; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:504
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:508
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:512
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:516
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:520
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:524
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:528
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:532
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:536
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:540
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:544
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:548
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:552
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:556
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:560
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:564
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:568
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:572
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:576
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:580
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:584
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:588
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:592
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v103, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v49.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v165.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB29_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB29_4
+; GFX11-TRUE16-NEXT:  .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB29_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v148.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v150.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v151.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v145.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v135.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v146.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v131.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v132.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v147.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v133.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v130.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v133.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v134.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v114.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v128.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v130.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v113.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v129.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v116.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v117.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v112.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v113.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v85.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v100.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v100.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v69.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT:  .LBB29_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v148.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v148.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v145.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v144.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v135.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v146.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v147.h, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v132.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v131.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v119.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v132.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v133.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v130.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v118.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v133.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v134.h, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v134.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v128.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v128.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v114.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v129.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v130.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v116.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v129.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v116.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v102.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v103.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v117.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v97.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v103.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v97.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v112.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v113.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v99.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v99.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v100.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v101.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v87.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v52.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v82.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v82.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v68.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v68.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v55.h, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v64.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v65.h, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v53.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v54.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v53.h, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v49.h, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v50.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v50.h, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v51.l, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:592
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:516
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:468
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v120, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v121, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v122, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v123, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v124, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v125, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v126, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v127, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v136, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v137, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:184
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:188
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:180
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v160, off, s32 offset:156
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v161, off, s32 offset:148
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v167, off, s32 offset:140
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v176, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v177, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v144, 8, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v145, 8, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v146, 8, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v124
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v125
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v126
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v127
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v111
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v121
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v120
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v122
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v123
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v107
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v108
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v109
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v110
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v106
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v6, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v92
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v78
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v77
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v76
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v93
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v94
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v95
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v104
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v105
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v79
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v88
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v89
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v90
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v91
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v40
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v61
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v62
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v72
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v47
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v160
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v132
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v179
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v180
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v183
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v162
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v163
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v164
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v165
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v144
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v145
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v146
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v147
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v148
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v119
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v128
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v129
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v130
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v32, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v33, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v34, v101
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v35, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr124
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr125
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr126
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr127
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr111
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr120
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr121
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr122
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr123
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr106
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr107
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr108
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr109
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr110
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr93
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr95
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr105
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr91
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:  .LBB29_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB29_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v51, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v50, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v124, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v125, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v126, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v127, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v48, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v36, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v35, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v111, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v120, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v121, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v122, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v107, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v108, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v109, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v38, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v110, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v106, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v32, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v92, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v78, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v77, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v76, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v75, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v74, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v60, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v59, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v93, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v94, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v95, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v104, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v105, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v79, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v88, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v89, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v90, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v91, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v58, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v44, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v43, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v42, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v41, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v40, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v178, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v177, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v176, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v62, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v63, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v72, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v73, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v45, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v46, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v47, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v56, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v57, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v160, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v151, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v150, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v149, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v135, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v134, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v133, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v132, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v180, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v181, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v182, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v183, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v162, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v163, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v164, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v165, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v112, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v98, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v97, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v96, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v87, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v84, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v83, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v145, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v146, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v147, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v148, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v119, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v128, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v129, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v130, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v81, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, v66, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, v65, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, v64, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v114, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v115, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v116, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v117, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v118, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v100, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v101, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v102, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, 0x300, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, 0x300, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, 0x300, v34
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, 0x300, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, 0x300, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:  .LBB29_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v138, off, s32 offset:392
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v137, off, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v136, off, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v127, off, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v126, off, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v125, off, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v124, off, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v123, off, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v122, off, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v121, off, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v120, off, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:516
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -45192,550 +50846,1113 @@ define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64bf16_to_v16i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v32
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB31_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v32, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v39, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v36, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v34, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v35, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v35, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
-; GFX11-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_bfe_u32 v34, v4, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v33, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v4
-; GFX11-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v35, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_bfe_u32 v33, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v34, v0, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_bfe_u32 v33, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v0
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v31, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v31
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v29
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT:    v_bfe_u32 v34, v28, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v28
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v27
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_bfe_u32 v35, v26, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v26
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v25
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v34, v24, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v24
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT:    v_bfe_u32 v35, v22, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v22
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v36, v34, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v21, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v21
-; GFX11-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
-; GFX11-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_bfe_u32 v38, v33, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v34, v20, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
-; GFX11-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_bfe_u32 v48, v18, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX11-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v35
-; GFX11-NEXT:    v_bfe_u32 v39, v17, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_bfe_u32 v48, v36, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v36
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v16
-; GFX11-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_bfe_u32 v37, v16, 16, 1
-; GFX11-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
-; GFX11-NEXT:  .LBB31_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB31_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v39, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v32.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v32, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v48, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v38, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v32, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v33
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v32
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v34
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v31.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v31, 0xffff, v31, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v30.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v30, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v28
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v29.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v29, 0xffff, v29, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v28.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v28, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v27.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v27, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v26, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v25.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v25, 0xffff, v25, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v24, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v23
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v22
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v23, 0xffff, v23, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v22, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v21, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v33, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v36, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v39, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v20, v32
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v19, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v50, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v18, v34
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v36.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v17, 0xffff, v17, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v36, v16
+; GFX11-TRUE16-NEXT:  .LBB31_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB31_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v33, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v36, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB31_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -52277,676 +58494,1224 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16f64_to_v128i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:88
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:84
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:80
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:76
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:72
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:68
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:64
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:60
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:12
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB36_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:  .LBB36_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB36_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
-; GFX11-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; GFX11-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
-; GFX11-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; GFX11-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
-; GFX11-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
-; GFX11-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; GFX11-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; GFX11-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; GFX11-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
-; GFX11-NEXT:  .LBB36_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v75
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v63
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v65
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v67, 0xff, v61
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v39, v55, v39
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v73
-; GFX11-NEXT:    v_or_b32_e32 v65, v67, v65
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v58
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v72
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v39
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v55, v55, v66
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v62
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v66
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v60
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v59
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v67
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v57
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v56
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v65
-; GFX11-NEXT:    v_or_b32_e32 v55, v66, v64
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v47
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v46
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v45
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v39
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v64
-; GFX11-NEXT:    v_or_b32_e32 v55, v65, v66
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v44
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v43
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v54, v65, v54
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v42
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v41
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v40
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v55
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_or_b32_e32 v54, v64, v65
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v182
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v183
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v181
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v53, v64, v53
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v55
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v65
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v39
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v54
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v53, 0xff, v180
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v179
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v178
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v177
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v176
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v167
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v166
-; GFX11-NEXT:    v_or_b32_e32 v53, v53, v54
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v55
-; GFX11-NEXT:    v_or_b32_e32 v52, v64, v52
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v65
-; GFX11-NEXT:    v_or_b32_e32 v54, v66, v67
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    v_or_b32_e32 v1, v9, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v10, v53
-; GFX11-NEXT:    v_or_b32_e32 v3, v11, v52
-; GFX11-NEXT:    v_or_b32_e32 v4, v12, v54
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v165
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v164
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v163
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v162
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v161
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v160
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v151
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v150
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v149
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v148
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v147
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v146
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v39, v49
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v145
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v144
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v135
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v134
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v133
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v132
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v131
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v130
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v21
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v129
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v128
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v119
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v118
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v117
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v38, v39
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v116
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v115
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v37
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v24
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v114
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v113
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v103
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v102
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v101
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v100
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v99
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v98
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v97
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v35
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v36, v35
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v96
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v87
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v86
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v84
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v83
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v82
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v81
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v31
-; GFX11-NEXT:    v_lshlrev_b16 v28, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_lshlrev_b16 v30, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v70
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v69
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v68
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v26, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v27, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v28, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v29, v33, v34
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:88
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16f64_to_v128i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB36_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB36_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB36_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB36_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v1.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v55, v39
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v67
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v65
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v39, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v66
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v39, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v39, v54
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v55
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v53
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v64
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v53, v55
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v55, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v39, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v39.l, v12.h, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v55, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v12, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v116.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v15, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v20, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v22, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v26, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v65
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v53, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v38, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v35, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v25, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v29, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT:    s_clause 0x5
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB36_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:  .LBB36_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB36_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB36_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v63
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v67, 0xff, v61
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, v55, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, v67, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v72
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v55, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v62
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v66, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v47
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v46
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v65, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v65, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v40
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v64, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v182
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v181
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v64, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v53, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v166
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v53, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v64, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v66, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v160
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v148
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v39, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v144
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v132
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v38, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v115
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v36, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v81
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v33, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    s_clause 0x5
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -57038,1038 +63803,1985 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v128i8_to_v16f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:592
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:588
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:584
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:580
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:576
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:572
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:568
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:564
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:560
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:556
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:552
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:548
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:544
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:540
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:536
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:532
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:528
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:524
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:520
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:516
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:512
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:508
-; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:504
-; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:500
-; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:496
-; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:492
-; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:488
-; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:484
-; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:480
-; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:476
-; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:472
-; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:468
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:464
-; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:460
-; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:456
-; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:452
-; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:448
-; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:444
-; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:440
-; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:436
-; GFX11-NEXT:    scratch_store_b32 off, v120, s32 offset:432
-; GFX11-NEXT:    scratch_store_b32 off, v121, s32 offset:428
-; GFX11-NEXT:    scratch_store_b32 off, v122, s32 offset:424
-; GFX11-NEXT:    scratch_store_b32 off, v123, s32 offset:420
-; GFX11-NEXT:    scratch_store_b32 off, v124, s32 offset:416
-; GFX11-NEXT:    scratch_store_b32 off, v125, s32 offset:412
-; GFX11-NEXT:    scratch_store_b32 off, v126, s32 offset:408
-; GFX11-NEXT:    scratch_store_b32 off, v127, s32 offset:404
-; GFX11-NEXT:    scratch_store_b32 off, v136, s32 offset:400
-; GFX11-NEXT:    scratch_store_b32 off, v137, s32 offset:396
-; GFX11-NEXT:    scratch_store_b32 off, v138, s32 offset:392
-; GFX11-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
-; GFX11-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
-; GFX11-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
-; GFX11-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
-; GFX11-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
-; GFX11-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
-; GFX11-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:380
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:372
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:364
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:356
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:348
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:340
-; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:332
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:324
-; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:316
-; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:308
-; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:300
-; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:292
-; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:284
-; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:276
-; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:268
-; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:260
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:252
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:248
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:244
-; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:240
-; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:236
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:232
-; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:228
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:224
-; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:220
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v114, off, s32 offset:388
-; GFX11-NEXT:    scratch_load_u16 v115, off, s32
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:152
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_u16 v181, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_u16 v182, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_u16 v136, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_u16 v137, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_u16 v138, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_u16 v132, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_u16 v134, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_u16 v150, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_u16 v160, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_u16 v161, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_u16 v167, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_u16 v176, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v177, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v41, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v42, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v124, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v125, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v126, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v127, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v111, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v120, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v121, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v122, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v123, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v106, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v107, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v108, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v109, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v110, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v93, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-NEXT:    v_lshlrev_b16 v94, 8, v115
-; GFX11-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-NEXT:    v_lshlrev_b16 v95, 8, v116
-; GFX11-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-NEXT:    v_lshlrev_b16 v104, 8, v117
-; GFX11-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-NEXT:    v_lshlrev_b16 v105, 8, v118
-; GFX11-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-NEXT:    v_lshlrev_b16 v79, 8, v119
-; GFX11-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-NEXT:    v_lshlrev_b16 v88, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-NEXT:    v_lshlrev_b16 v89, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-NEXT:    v_lshlrev_b16 v90, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-NEXT:    v_lshlrev_b16 v91, 8, v131
-; GFX11-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-NEXT:    v_lshlrev_b16 v61, 8, v144
-; GFX11-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-NEXT:    v_lshlrev_b16 v62, 8, v145
-; GFX11-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-NEXT:    v_lshlrev_b16 v63, 8, v146
-; GFX11-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-NEXT:    v_lshlrev_b16 v72, 8, v147
-; GFX11-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-NEXT:    v_lshlrev_b16 v73, 8, v148
-; GFX11-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-NEXT:    v_lshlrev_b16 v45, 8, v162
-; GFX11-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-NEXT:    v_lshlrev_b16 v46, 8, v163
-; GFX11-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-NEXT:    v_lshlrev_b16 v47, 8, v164
-; GFX11-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-NEXT:    v_lshlrev_b16 v56, 8, v165
-; GFX11-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-NEXT:    v_lshlrev_b16 v57, 8, v166
-; GFX11-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-NEXT:    v_lshlrev_b16 v179, 8, v179
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v180, 8, v180
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshlrev_b16 v181, 8, v181
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v182, 8, v182
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b16 v183, 8, v183
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v162, 8, v136
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b16 v163, 8, v137
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v164, 8, v138
-; GFX11-NEXT:    v_lshlrev_b16 v165, 8, v103
-; GFX11-NEXT:    v_lshlrev_b16 v166, 8, v102
-; GFX11-NEXT:    v_lshlrev_b16 v144, 8, v101
-; GFX11-NEXT:    v_lshlrev_b16 v145, 8, v100
-; GFX11-NEXT:    v_lshlrev_b16 v146, 8, v99
-; GFX11-NEXT:    v_lshlrev_b16 v147, 8, v31
-; GFX11-NEXT:    v_lshlrev_b16 v148, 8, v30
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v28
-; GFX11-NEXT:    v_lshlrev_b16 v128, 8, v26
-; GFX11-NEXT:    v_lshlrev_b16 v129, 8, v24
-; GFX11-NEXT:    v_lshlrev_b16 v130, 8, v22
-; GFX11-NEXT:    v_lshlrev_b16 v131, 8, v20
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v18
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v16
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v14
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v12
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v0
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB37_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v51
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v124
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v125
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v126
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v127
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v37
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v111
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v121
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v120
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v122
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v123
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v107
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v108
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v109
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v110
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v106
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v6, v12
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v92
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v78
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v77
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v76
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v75
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v74
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v60
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v59
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v93
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v94
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v95
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v104
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v105
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v79
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v88
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v89
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v90
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v91
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v58
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v44
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v43
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v42
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v41
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v40
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v178
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v177
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v176
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v167
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v61
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v62
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v63
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v72
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v73
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v45
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v46
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v47
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v56
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v57
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v161
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v160
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v151
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v150
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v149
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v135
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v134
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v133
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v132
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v113
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v179
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v180
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v181
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v182
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v183
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v162
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v163
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v164
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v165
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v166
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v112
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v98
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v96
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v84
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v83
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v82
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v144
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v145
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v146
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v147
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v148
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v119
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v128
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v129
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v130
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v131
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v66
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v64
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v114
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v115
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v116
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v117
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v118
-; GFX11-NEXT:    v_or_b32_e32 v32, v32, v99
-; GFX11-NEXT:    v_or_b32_e32 v33, v33, v100
-; GFX11-NEXT:    v_or_b32_e32 v34, v34, v101
-; GFX11-NEXT:    v_or_b32_e32 v35, v35, v102
-; GFX11-NEXT:    v_or_b32_e32 v36, v36, v103
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr92
-; GFX11-NEXT:    ; implicit-def: $vgpr78
-; GFX11-NEXT:    ; implicit-def: $vgpr77
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr124
-; GFX11-NEXT:    ; implicit-def: $vgpr125
-; GFX11-NEXT:    ; implicit-def: $vgpr126
-; GFX11-NEXT:    ; implicit-def: $vgpr127
-; GFX11-NEXT:    ; implicit-def: $vgpr111
-; GFX11-NEXT:    ; implicit-def: $vgpr120
-; GFX11-NEXT:    ; implicit-def: $vgpr121
-; GFX11-NEXT:    ; implicit-def: $vgpr122
-; GFX11-NEXT:    ; implicit-def: $vgpr123
-; GFX11-NEXT:    ; implicit-def: $vgpr106
-; GFX11-NEXT:    ; implicit-def: $vgpr107
-; GFX11-NEXT:    ; implicit-def: $vgpr108
-; GFX11-NEXT:    ; implicit-def: $vgpr109
-; GFX11-NEXT:    ; implicit-def: $vgpr110
-; GFX11-NEXT:    ; implicit-def: $vgpr93
-; GFX11-NEXT:    ; implicit-def: $vgpr94
-; GFX11-NEXT:    ; implicit-def: $vgpr95
-; GFX11-NEXT:    ; implicit-def: $vgpr104
-; GFX11-NEXT:    ; implicit-def: $vgpr105
-; GFX11-NEXT:    ; implicit-def: $vgpr79
-; GFX11-NEXT:    ; implicit-def: $vgpr88
-; GFX11-NEXT:    ; implicit-def: $vgpr89
-; GFX11-NEXT:    ; implicit-def: $vgpr90
-; GFX11-NEXT:    ; implicit-def: $vgpr91
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:  .LBB37_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB37_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v51, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v5, v50, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v124, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v125, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v126, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v127, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v7, v48, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_add_nc_u16 v8, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v36, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v10, v35, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v2, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v123, v2
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_or_b32_e32 v3, v111, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v120, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v121, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v6, v122, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v107, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v108, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v109, v10
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v38, 3
-; GFX11-NEXT:    v_or_b32_e32 v11, v110, v11
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v106, v2
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT:    v_add_nc_u16 v7, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v32, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v9, v92, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v78, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v77, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v76, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v75, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v74, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v60, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v59, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v93, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v94, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v95, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v104, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v105, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v79, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v88, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v89, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v90, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v91, v16
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT:    v_add_nc_u16 v12, v58, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v44, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v43, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v42, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v41, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v40, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v178, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v177, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v176, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v167, 3
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v61, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v62, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v63, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v72, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v73, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v45, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v46, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v47, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v56, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v57, v21
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v14, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v16, v20, v21
-; GFX11-NEXT:    v_add_nc_u16 v17, v161, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v160, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v151, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v150, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v149, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v135, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v134, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v133, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v132, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v113, 3
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v179, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v180, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v181, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v182, v20
-; GFX11-NEXT:    v_or_b32_e32 v21, v183, v21
-; GFX11-NEXT:    v_or_b32_e32 v22, v162, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v163, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v164, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v165, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v166, v26
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_add_nc_u16 v22, v112, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v98, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v97, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, v96, 3
-; GFX11-NEXT:    v_add_nc_u16 v26, v87, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v84, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v83, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v82, 3
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v144, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v145, v23
-; GFX11-NEXT:    v_or_b32_e32 v24, v146, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v147, v25
-; GFX11-NEXT:    v_or_b32_e32 v26, v148, v26
-; GFX11-NEXT:    v_or_b32_e32 v27, v119, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v128, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v129, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v130, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v131, v31
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v24
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v25
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v26
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v23, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v25, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v30, v31
-; GFX11-NEXT:    v_add_nc_u16 v27, v81, 3
-; GFX11-NEXT:    v_add_nc_u16 v28, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v29, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v30, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v32, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v33, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v34, v66, 3
-; GFX11-NEXT:    v_add_nc_u16 v35, v65, 3
-; GFX11-NEXT:    v_add_nc_u16 v36, v64, 3
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v114, v27
-; GFX11-NEXT:    v_or_b32_e32 v28, v115, v28
-; GFX11-NEXT:    v_or_b32_e32 v29, v116, v29
-; GFX11-NEXT:    v_or_b32_e32 v30, v117, v30
-; GFX11-NEXT:    v_or_b32_e32 v31, v118, v31
-; GFX11-NEXT:    v_or_b32_e32 v32, v99, v32
-; GFX11-NEXT:    v_or_b32_e32 v33, v100, v33
-; GFX11-NEXT:    v_or_b32_e32 v34, v101, v34
-; GFX11-NEXT:    v_or_b32_e32 v35, v102, v35
-; GFX11-NEXT:    v_or_b32_e32 v36, v103, v36
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v27
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v28
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v29
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v30
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v31
-; GFX11-NEXT:    v_add_nc_u16 v32, 0x300, v32
-; GFX11-NEXT:    v_add_nc_u16 v33, 0x300, v33
-; GFX11-NEXT:    v_add_nc_u16 v34, 0x300, v34
-; GFX11-NEXT:    v_add_nc_u16 v35, 0x300, v35
-; GFX11-NEXT:    v_add_nc_u16 v36, 0x300, v36
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xffff, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v28, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v29, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v30, v33, v34
-; GFX11-NEXT:    v_or_b32_e32 v31, v35, v36
-; GFX11-NEXT:  .LBB37_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v138, off, s32 offset:392
-; GFX11-NEXT:    scratch_load_b32 v137, off, s32 offset:396
-; GFX11-NEXT:    scratch_load_b32 v136, off, s32 offset:400
-; GFX11-NEXT:    scratch_load_b32 v127, off, s32 offset:404
-; GFX11-NEXT:    scratch_load_b32 v126, off, s32 offset:408
-; GFX11-NEXT:    scratch_load_b32 v125, off, s32 offset:412
-; GFX11-NEXT:    scratch_load_b32 v124, off, s32 offset:416
-; GFX11-NEXT:    scratch_load_b32 v123, off, s32 offset:420
-; GFX11-NEXT:    scratch_load_b32 v122, off, s32 offset:424
-; GFX11-NEXT:    scratch_load_b32 v121, off, s32 offset:428
-; GFX11-NEXT:    scratch_load_b32 v120, off, s32 offset:432
-; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:436
-; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:440
-; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:444
-; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:448
-; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:452
-; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:456
-; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:460
-; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:464
-; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:468
-; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:472
-; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:476
-; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:480
-; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:484
-; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:488
-; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:492
-; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:496
-; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:500
-; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:504
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:508
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:512
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:516
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:520
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:524
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:528
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:532
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:536
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:540
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:544
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:548
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:552
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:556
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:560
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:564
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:568
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:572
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:576
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:580
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:584
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:588
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:592
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v16f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v49, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v51, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v103, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v81, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v81, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v82, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v83, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v87, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v99, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v99, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v160, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v160, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v161, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v161, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v162, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v162, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v163, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v163, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v164, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v164, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v165, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v165, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v97, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v101, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v102, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v102, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v118.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.l, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v130.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v135.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v148.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v49.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v64.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v68.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v103
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v81.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v82.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v83.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v86.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v87.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v99.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v100.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.l, 8, v160.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v100.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v161.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v86.h, 8, v162.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v163.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.h, 8, v163.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v96.l, 8, v164.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v164.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v165.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.l, 8, v165.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v69.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v69.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v65.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB37_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB37_4
+; GFX11-TRUE16-NEXT:  .LBB37_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB37_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v149.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v148.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v148.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v150.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v151.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v144.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v145.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v132.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v135.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v146.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v131.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v132.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v147.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v131.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v119.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v133.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v119.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v130.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v133.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v134.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v118.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v114.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v128.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v114.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v130.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v113.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v129.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v116.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v8.h, v117.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v103.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v103.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v112.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v113.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v85.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v99.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v100.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v13.h, v100.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v101.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v86.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v17.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v82.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v18.h, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v83.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v68.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v68.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v69.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v55.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v64.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v22.h, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v23.h, v65.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v27.h, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v28.h, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v51.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB37_2
+; GFX11-TRUE16-NEXT:  .LBB37_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v149.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v149.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v148.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v145.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v148.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v145.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v144.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v144.l, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v135.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v135.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v146.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v147.h, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v147.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v132.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v131.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v131.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v119.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v132.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v133.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v130.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v119.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v118.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v133.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v134.h, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v134.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v128.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v128.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v11, v12
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v114.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v114.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v115.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v113.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v129.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v130.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v116.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v129.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v116.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v102.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, v101.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, v102.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v117.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v118.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v103.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v117.h, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v97.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v103.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v97.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v96.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v85.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v112.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v85.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v112.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v113.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v99.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v99.h, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v17, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v100.l, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v100.h, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v101.l, v14.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v86.l, v17.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v13.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v19, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v86.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v21, v22
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, v67.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v87.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v96.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v87.h, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v81.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, v52.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v81.h, v17.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v22, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v82.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v82.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v83.l, v19.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v21
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v18.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v68.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v69.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v24, v25
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v68.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v69.h, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v21, v29
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v70.l, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v55.h, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v64.h, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v26
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v64.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v65.l, v23.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v22.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v29, v30
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v65.h, v24.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v53.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v54.l, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v55.l, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v53.h, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v54.h, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v31, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, v26, v39
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v29
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v49.h, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v50.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v50.h, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v51.l, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v51.h, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, v25, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v31
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v27.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v28.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, v35, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, v37, v38
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, v39, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, v30, v33
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, v31, v34
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v16f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:592
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:516
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:468
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v120, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v121, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v122, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v123, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v124, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v125, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v126, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v127, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v136, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v137, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v138, s32 offset:392
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:220
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v114, off, s32 offset:388
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:136
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:144
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:152
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:160
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v181, off, s32 offset:168
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32 offset:176
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:184
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v136, off, s32 offset:192
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v137, off, s32 offset:200
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v138, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v132, off, s32 offset:212
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:204
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:196
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:188
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:180
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v150, off, s32 offset:172
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:164
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v160, off, s32 offset:156
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v161, off, s32 offset:148
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v167, off, s32 offset:140
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v176, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v177, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v41, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v115
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v116
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v117
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v118
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v119
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v131
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v144
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v145
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v146
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v147
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v148
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v162
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v163
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v164
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v165
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v166
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v179, 8, v179
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v180, 8, v180
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v181
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v182
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v183, 8, v183
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v162, 8, v136
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v163, 8, v137
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v164, 8, v138
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v165, 8, v103
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v166, 8, v102
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v144, 8, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v145, 8, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v146, 8, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v147, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v148, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v129, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v130, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v131, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v0
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB37_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v124
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v125
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v126
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v127
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v111
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v121
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v120
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v122
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v123
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v107
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v108
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v109
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v110
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v106
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v6, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v92
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v78
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v77
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v76
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v93
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v94
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v95
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v104
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v105
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v79
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v88
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v89
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v90
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v91
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v44
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v40
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v61
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v62
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v72
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v47
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v56
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v160
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v132
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v179
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v180
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v183
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v162
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v163
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v164
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v165
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v166
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v144
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v145
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v146
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v147
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v148
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v119
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v128
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v129
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v130
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v32, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v33, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v34, v101
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v35, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v36, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr124
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr125
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr126
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr127
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr111
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr120
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr121
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr122
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr123
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr106
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr107
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr108
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr109
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr110
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr93
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr95
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr105
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr91
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:  .LBB37_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB37_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v51, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v50, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v124, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v125, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v126, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v127, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v48, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v36, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v35, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v111, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v120, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v121, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v122, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v107, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v108, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v109, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v38, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v110, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v106, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v10, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v32, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v92, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v78, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v77, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v76, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v75, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v74, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v60, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v59, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v93, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v94, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v95, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v104, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v105, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v79, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v88, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v89, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v90, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v91, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v58, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v44, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v43, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v42, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v41, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v40, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v178, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v177, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v176, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v167, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v61, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v62, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v63, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v72, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v73, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v45, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v46, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v47, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v56, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v57, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v161, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v160, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v151, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v150, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v149, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v135, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v134, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v133, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v132, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v113, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v179, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v180, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v181, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v182, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v183, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v162, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v163, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v164, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v165, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v166, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v112, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v98, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v97, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, v96, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, v87, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v84, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v83, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v82, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v144, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v145, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v146, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v147, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v148, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v119, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v128, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v129, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v130, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v131, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v25
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v26
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v30, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, v81, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, v66, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, v65, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, v64, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v114, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v115, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v116, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v117, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v118, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v99, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v100, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v101, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v102, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v103, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v27
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v28
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v29
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v30
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v31
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, 0x300, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, 0x300, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, 0x300, v34
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, 0x300, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, 0x300, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xffff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xffff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v33, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v35, v36
+; GFX11-FAKE16-NEXT:  .LBB37_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v138, off, s32 offset:392
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v137, off, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v136, off, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v127, off, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v126, off, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v125, off, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v124, off, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v123, off, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v122, off, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v121, off, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v120, off, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:516
+; GFX11-FAKE16-NEXT:    s_clause 0x12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:584
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:588
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:592
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -60671,550 +68383,1113 @@ define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64bf16_to_v16f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v32
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB39_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v32, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v32
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v39, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v36, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v34, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v35, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v35, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v33, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
-; GFX11-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_bfe_u32 v34, v4, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v33, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v4
-; GFX11-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v35, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_bfe_u32 v33, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v34, v0, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_bfe_u32 v33, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v0
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v31, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
-; GFX11-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v31
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_bfe_u32 v35, v30, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v30
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v29, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v29
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT:    v_bfe_u32 v34, v28, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v28
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v27, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
-; GFX11-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v27
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_bfe_u32 v35, v26, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v26
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v25
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v34, v24, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v24
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v33, v23, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT:    v_bfe_u32 v35, v22, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v22
-; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v36, v34, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v32, v21, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v21
-; GFX11-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
-; GFX11-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_bfe_u32 v38, v33, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v33
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v34, v20, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
-; GFX11-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v33
-; GFX11-NEXT:    v_bfe_u32 v37, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_bfe_u32 v48, v18, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; GFX11-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
-; GFX11-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v35
-; GFX11-NEXT:    v_bfe_u32 v39, v17, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_bfe_u32 v48, v36, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v36
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v16
-; GFX11-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_bfe_u32 v37, v16, 16, 1
-; GFX11-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
-; GFX11-NEXT:  .LBB39_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v16f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB39_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v32, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v32, v35, v38 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v15, 0x40c00000, v15 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v39, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v32.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_cndmask_b32 v15, v36, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v32, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v48, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v35, v36 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v38, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v36, v37 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v32, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v33
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v32
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_cndmask_b32 v9, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v3, v32, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v38 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v34
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v31.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v31, 0xffff, v31, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v30.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v30, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v28
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v29.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v29, 0xffff, v29, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v28.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v28, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v27.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v27, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v26, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v25.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v25, 0xffff, v25, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v24, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v23
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v32, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v22
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v23, 0xffff, v23, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v35, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v22, v33
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v21, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v33, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v32, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v36, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v39, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v33, v33, v38 :: v_dual_and_b32 v38, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v20, v32
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v37, 0x40c00000, v38 :: v_dual_cndmask_b32 v34, v34, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v19, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v50, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v18, v34
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v36.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v17, 0xffff, v17, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v36, v16
+; GFX11-TRUE16-NEXT:  .LBB39_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v16f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB39_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v31
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v29
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v31, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v30, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v29, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v27
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v29, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v28, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v27, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v27, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v26, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v25, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v34, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v24, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v33, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v33, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v23, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v22, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v36, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v32, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v33, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v32, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v38, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v21, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v20, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v37, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v48, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v19, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v36, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v18, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v39, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v17, v35, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v37, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v16, v36, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB39_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -69155,923 +77430,1689 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v128i8_to_v64bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:580
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:576
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:572
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:568
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:564
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:560
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:556
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:552
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:548
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:544
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:540
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:536
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:532
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:528
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:524
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:520
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:516
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:512
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:508
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:504
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:500
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:496
-; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:492
-; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:488
-; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:484
-; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:480
-; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:476
-; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:472
-; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:468
-; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:464
-; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:460
-; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:456
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:452
-; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:448
-; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:444
-; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:440
-; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:436
-; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:432
-; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:428
-; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:424
-; GFX11-NEXT:    scratch_store_b32 off, v120, s32 offset:420
-; GFX11-NEXT:    scratch_store_b32 off, v121, s32 offset:416
-; GFX11-NEXT:    scratch_store_b32 off, v122, s32 offset:412
-; GFX11-NEXT:    scratch_store_b32 off, v123, s32 offset:408
-; GFX11-NEXT:    scratch_store_b32 off, v124, s32 offset:404
-; GFX11-NEXT:    scratch_store_b32 off, v125, s32 offset:400
-; GFX11-NEXT:    scratch_store_b32 off, v126, s32 offset:396
-; GFX11-NEXT:    scratch_store_b32 off, v127, s32 offset:392
-; GFX11-NEXT:    v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24
-; GFX11-NEXT:    v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26
-; GFX11-NEXT:    v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20
-; GFX11-NEXT:    v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16
-; GFX11-NEXT:    v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4
-; GFX11-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:380
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-NEXT:    scratch_load_u16 v134, off, s32 offset:372
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:364
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:356
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:348
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:340
-; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:332
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:324
-; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:316
-; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:308
-; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:300
-; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:292
-; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:284
-; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:276
-; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:268
-; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:260
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:252
-; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:248
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-NEXT:    scratch_load_u16 v88, off, s32 offset:240
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:236
-; GFX11-NEXT:    scratch_load_u16 v93, off, s32 offset:232
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:228
-; GFX11-NEXT:    scratch_load_u16 v91, off, s32 offset:224
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:220
-; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v150, off, s32 offset:388
-; GFX11-NEXT:    scratch_load_u16 v182, off, s32
-; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v61, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v63, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v72, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v73, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:152
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_u16 v79, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_u16 v89, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_u16 v90, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_u16 v95, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_u16 v104, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_u16 v105, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_u16 v42, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_u16 v177, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v70, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v82, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v128, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v132, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v161, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v160, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v176, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v167, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v181, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-NEXT:    v_lshlrev_b16 v127, 8, v0
-; GFX11-NEXT:    v_lshlrev_b16 v126, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v124, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v125, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v120, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v123, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v121, 8, v12
-; GFX11-NEXT:    v_lshlrev_b16 v122, 8, v14
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    v_lshlrev_b16 v106, 8, v16
-; GFX11-NEXT:    v_lshlrev_b16 v111, 8, v18
-; GFX11-NEXT:    v_lshlrev_b16 v109, 8, v20
-; GFX11-NEXT:    v_lshlrev_b16 v110, 8, v22
-; GFX11-NEXT:    v_lshlrev_b16 v107, 8, v24
-; GFX11-NEXT:    v_lshlrev_b16 v108, 8, v26
-; GFX11-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-NEXT:    v_lshlrev_b16 v88, 8, v88
-; GFX11-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-NEXT:    v_lshlrev_b16 v93, 8, v93
-; GFX11-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-NEXT:    v_lshlrev_b16 v91, 8, v91
-; GFX11-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-NEXT:    v_lshlrev_b16 v92, 8, v92
-; GFX11-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
-; GFX11-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-NEXT:    v_lshlrev_b16 v150, 8, v182
-; GFX11-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-NEXT:    v_lshlrev_b16 v41, 8, v40
-; GFX11-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-NEXT:    v_lshlrev_b16 v40, 8, v43
-; GFX11-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-NEXT:    v_lshlrev_b16 v43, 8, v44
-; GFX11-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-NEXT:    v_lshlrev_b16 v182, 8, v45
-; GFX11-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-NEXT:    v_lshlrev_b16 v46, 8, v46
-; GFX11-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-NEXT:    v_lshlrev_b16 v45, 8, v47
-; GFX11-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-NEXT:    v_lshlrev_b16 v57, 8, v56
-; GFX11-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-NEXT:    v_lshlrev_b16 v56, 8, v58
-; GFX11-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-NEXT:    v_lshlrev_b16 v58, 8, v59
-; GFX11-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-NEXT:    v_lshlrev_b16 v44, 8, v60
-; GFX11-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-NEXT:    v_lshlrev_b16 v60, 8, v61
-; GFX11-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-NEXT:    v_lshlrev_b16 v59, 8, v62
-; GFX11-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-NEXT:    v_lshlrev_b16 v62, 8, v63
-; GFX11-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-NEXT:    v_lshlrev_b16 v47, 8, v72
-; GFX11-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-NEXT:    v_lshlrev_b16 v72, 8, v73
-; GFX11-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-NEXT:    v_lshlrev_b16 v63, 8, v74
-; GFX11-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-NEXT:    v_lshlrev_b16 v74, 8, v75
-; GFX11-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-NEXT:    v_lshlrev_b16 v73, 8, v76
-; GFX11-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-NEXT:    v_lshlrev_b16 v75, 8, v77
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v61, 8, v78
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshlrev_b16 v78, 8, v79
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v77, 8, v89
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b16 v79, 8, v90
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v76, 8, v95
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b16 v90, 8, v104
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v89, 8, v105
-; GFX11-NEXT:    v_lshlrev_b16 v104, 8, v94
-; GFX11-NEXT:    v_lshlrev_b16 v95, 8, v31
-; GFX11-NEXT:    v_lshlrev_b16 v105, 8, v30
-; GFX11-NEXT:    v_lshlrev_b16 v94, 8, v28
-; GFX11-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB44_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v48
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v84
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v51
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v34
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v115
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v66
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v128
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v113
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v132
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v100
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v161
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v160
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v176
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v167
-; GFX11-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v5, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v11, v10, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v37
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v102
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v114
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v96
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v133
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v135
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v130
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v181
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v150
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v41
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v40
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v43
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v182
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v46
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v45
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v57
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v56
-; GFX11-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v14, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v16, v15, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v147
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v119
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v149
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v144
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v162
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v146
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v178
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v164
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v151
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v148
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v58
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v44
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v60
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v59
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v62
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v47
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v72
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v63
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v74
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v73
-; GFX11-NEXT:    v_perm_b32 v12, v13, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v15, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v17, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v19, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v16, v21, v20, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v166
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v145
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v177
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v163
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v179
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v165
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v183
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v180
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v42
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v65
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v75
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v61
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v78
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v77
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v79
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v76
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v90
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v89
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v92
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v91
-; GFX11-NEXT:    v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v18, v20, v19, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v19, v22, v21, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v20, v24, v23, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v21, v26, v25, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v83
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v101
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v93
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v88
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v104
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v95
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v105
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v94
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v108
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v107
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v110
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v109
-; GFX11-NEXT:    v_perm_b32 v22, v23, v22, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v23, v25, v24, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v24, v27, v26, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v25, v29, v28, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v26, v31, v30, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v103
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v112
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v99
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v129
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v98
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v131
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v116
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v134
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v118
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v111
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v106
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v122
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v121
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v123
-; GFX11-NEXT:    v_or_b32_e32 v32, v32, v120
-; GFX11-NEXT:    v_or_b32_e32 v33, v33, v125
-; GFX11-NEXT:    v_or_b32_e32 v34, v34, v124
-; GFX11-NEXT:    v_or_b32_e32 v35, v35, v126
-; GFX11-NEXT:    v_or_b32_e32 v36, v36, v127
-; GFX11-NEXT:    v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v28, v30, v29, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v29, v32, v31, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v30, v34, v33, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v31, v36, v35, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr78
-; GFX11-NEXT:    ; implicit-def: $vgpr77
-; GFX11-NEXT:    ; implicit-def: $vgpr79
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr90
-; GFX11-NEXT:    ; implicit-def: $vgpr89
-; GFX11-NEXT:    ; implicit-def: $vgpr92
-; GFX11-NEXT:    ; implicit-def: $vgpr91
-; GFX11-NEXT:    ; implicit-def: $vgpr93
-; GFX11-NEXT:    ; implicit-def: $vgpr88
-; GFX11-NEXT:    ; implicit-def: $vgpr104
-; GFX11-NEXT:    ; implicit-def: $vgpr95
-; GFX11-NEXT:    ; implicit-def: $vgpr105
-; GFX11-NEXT:    ; implicit-def: $vgpr94
-; GFX11-NEXT:    ; implicit-def: $vgpr108
-; GFX11-NEXT:    ; implicit-def: $vgpr107
-; GFX11-NEXT:    ; implicit-def: $vgpr110
-; GFX11-NEXT:    ; implicit-def: $vgpr109
-; GFX11-NEXT:    ; implicit-def: $vgpr111
-; GFX11-NEXT:    ; implicit-def: $vgpr106
-; GFX11-NEXT:    ; implicit-def: $vgpr122
-; GFX11-NEXT:    ; implicit-def: $vgpr121
-; GFX11-NEXT:    ; implicit-def: $vgpr123
-; GFX11-NEXT:    ; implicit-def: $vgpr120
-; GFX11-NEXT:    ; implicit-def: $vgpr125
-; GFX11-NEXT:    ; implicit-def: $vgpr124
-; GFX11-NEXT:    ; implicit-def: $vgpr126
-; GFX11-NEXT:    ; implicit-def: $vgpr127
-; GFX11-NEXT:  .LBB44_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB44_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v134, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v118, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v131, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v116, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v129, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v35, v35, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v126, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v127, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v125, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v124, v3
-; GFX11-NEXT:    v_add_nc_u16 v33, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v98, 3
-; GFX11-NEXT:    v_add_nc_u16 v116, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v98, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v1, v112, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_add_nc_u16 v3, v99, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v103, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v123, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v120, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v122, v1
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v121, v3
-; GFX11-NEXT:    v_add_nc_u16 v99, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v111, v4
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v81, 3
-; GFX11-NEXT:    v_add_nc_u16 v81, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v101, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v86, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v97, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v83, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v106, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v110, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v109, v0
-; GFX11-NEXT:    v_add_nc_u16 v83, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v108, v3
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v107, v4
-; GFX11-NEXT:    v_add_nc_u16 v86, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v67, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v80, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v68, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v69, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v105, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v94, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v104, v2
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v95, v3
-; GFX11-NEXT:    v_add_nc_u16 v68, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v93, v4
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v64, 3
-; GFX11-NEXT:    v_add_nc_u16 v64, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_add_nc_u16 v0, v42, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v65, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_add_nc_u16 v3, v183, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v180, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v88, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v91, v1
-; GFX11-NEXT:    v_add_nc_u16 v65, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v90, v3
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v89, v4
-; GFX11-NEXT:    v_add_nc_u16 v69, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_add_nc_u16 v1, v179, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v165, 3
-; GFX11-NEXT:    v_add_nc_u16 v80, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_add_nc_u16 v0, v177, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v163, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_add_nc_u16 v4, v166, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v79, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v76, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v78, v0
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-NEXT:    v_add_nc_u16 v85, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v75, v4
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v145, 3
-; GFX11-NEXT:    v_add_nc_u16 v97, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_add_nc_u16 v1, v151, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v148, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_add_nc_u16 v3, v178, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v164, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v61, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v74, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v73, v2
-; GFX11-NEXT:    v_add_nc_u16 v101, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v63, v4
-; GFX11-NEXT:    v_add_nc_u16 v103, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_add_nc_u16 v2, v162, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v146, 3
-; GFX11-NEXT:    v_add_nc_u16 v112, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_add_nc_u16 v1, v149, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v144, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u16 v4, v147, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v62, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v47, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v60, v1
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v59, v3
-; GFX11-NEXT:    v_add_nc_u16 v118, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v58, v4
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v119, 3
-; GFX11-NEXT:    v_add_nc_u16 v119, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_nc_u16 v2, v135, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v130, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_nc_u16 v3, v133, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v117, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v44, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v57, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-NEXT:    v_add_nc_u16 v117, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v46, v3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v45, v4
-; GFX11-NEXT:    v_add_nc_u16 v129, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v0, v114, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v96, 3
-; GFX11-NEXT:    v_add_nc_u16 v96, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v2, v102, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v87, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v55, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v43, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v182, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v41, v2
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v40, v3
-; GFX11-NEXT:    v_add_nc_u16 v55, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v181, v4
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v37, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v52, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v53, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v50, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v150, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v176, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v167, v1
-; GFX11-NEXT:    v_add_nc_u16 v50, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v161, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v160, v4
-; GFX11-NEXT:    v_add_nc_u16 v52, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v51, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v49, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v48, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v36, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v132, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v128, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v115, v0
-; GFX11-NEXT:    v_add_nc_u16 v34, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v100, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v113, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v39, 3
-; GFX11-NEXT:    v_add_nc_u16 v36, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v32, v32, 3
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-NEXT:    v_or_b32_e32 v35, v71, v35
-; GFX11-NEXT:    v_or_b32_e32 v33, v82, v33
-; GFX11-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-NEXT:    v_or_b32_e32 v36, v70, v36
-; GFX11-NEXT:    v_or_b32_e32 v32, v66, v32
-; GFX11-NEXT:    v_add_nc_u16 v35, 0x300, v35
-; GFX11-NEXT:    v_add_nc_u16 v33, 0x300, v33
-; GFX11-NEXT:    v_add_nc_u16 v38, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v36
-; GFX11-NEXT:    v_add_nc_u16 v32, 0x300, v32
-; GFX11-NEXT:    v_add_nc_u16 v36, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v39, 0x300, v1
-; GFX11-NEXT:    v_perm_b32 v1, v33, v38, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v32, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v39, v34, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v52, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v50, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v37, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v96, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v129, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v117, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v119, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v118, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v112, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v16, v103, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v17, v101, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v18, v97, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v19, v85, v19, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v20, v80, v20, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v21, v69, v21, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v24, v68, v24, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v25, v67, v25, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v26, v86, v26, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v27, v83, v27, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v28, v81, v28, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v29, v99, v29, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v30, v98, v30, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v31, v116, v31, 0x5040100
-; GFX11-NEXT:  .LBB44_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v127, off, s32 offset:392
-; GFX11-NEXT:    scratch_load_b32 v126, off, s32 offset:396
-; GFX11-NEXT:    scratch_load_b32 v125, off, s32 offset:400
-; GFX11-NEXT:    scratch_load_b32 v124, off, s32 offset:404
-; GFX11-NEXT:    scratch_load_b32 v123, off, s32 offset:408
-; GFX11-NEXT:    scratch_load_b32 v122, off, s32 offset:412
-; GFX11-NEXT:    scratch_load_b32 v121, off, s32 offset:416
-; GFX11-NEXT:    scratch_load_b32 v120, off, s32 offset:420
-; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:424
-; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:428
-; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:432
-; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:436
-; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:440
-; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:444
-; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:448
-; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:452
-; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:456
-; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:460
-; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:464
-; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:468
-; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:472
-; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:476
-; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:480
-; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:484
-; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:488
-; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:492
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:496
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:500
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:504
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:508
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:512
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:516
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:520
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:524
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:528
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:532
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:536
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:540
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:544
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:548
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:552
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:556
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:560
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:564
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:568
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:572
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:576
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:580
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v116, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v117, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v118, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v119, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v128, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v132, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v113.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.l, 8, v114.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.h, 8, v115.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v116.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v116.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v117.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB44_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB44_4
+; GFX11-TRUE16-NEXT:  .LBB44_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB44_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v85.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v100.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v81.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v98.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v102.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v102.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v112.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v112.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v113.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v103.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v115.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v118.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v119.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v114.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v119.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v128.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v130.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v117.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v130.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v131.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v129.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v132.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v133.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v133.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v134.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v135.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v144.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v134.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v145.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v146.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v146.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v147.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v145.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v148.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v149.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v149.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v147.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v150.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v151.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB44_2
+; GFX11-TRUE16-NEXT:  .LBB44_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v50.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v48.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v149.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v147.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v148.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v145.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v146.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v147.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v145.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v135.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v144.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v134.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v135.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v100.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v100.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v132.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v133.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v134.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v132.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v97.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v96.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v87.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v131.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v129.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v130.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v96.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v117.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v128.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v129.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v119.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v85.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v128.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v119.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v114.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v118.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v117.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v114.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v116.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v116.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v68.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v67.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v115.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v113.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v112.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v65.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v112.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v102.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v103.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v102.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v99.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v87.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v98.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v97.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v70.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v82.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v83.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v81.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v32.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v32.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:456
+; GFX11-FAKE16-NEXT:    s_clause 0xf
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v127, s32 offset:392
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v150, off, s32 offset:388
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v61, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v63, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v72, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v73, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:136
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:144
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:152
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:160
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:168
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:176
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:192
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:200
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:212
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:204
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:188
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:180
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:172
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v177, off, s32 offset:164
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:156
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:148
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:140
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v70, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v132, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v161, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v160, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v176, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v167, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v150, 8, v182
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v41, 8, v40
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v40, 8, v43
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v43, 8, v44
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v45
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v46
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v47
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v56
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v58
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v58, 8, v59
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v44, 8, v60
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v60, 8, v61
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v59, 8, v62
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v63
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v72
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v73
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v74
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v74, 8, v75
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v76
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v75, 8, v77
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v78
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v78, 8, v79
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v77, 8, v89
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v90
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v76, 8, v95
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v104
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v105
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v94
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB44_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v82
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v128
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v132
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v161
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v160
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v176
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v167
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v5, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v9, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v37
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v102
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v130
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v150
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v41
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v40
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v43
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v57
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v56
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v14, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v16, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v144
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v148
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v58
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v44
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v60
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v59
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v62
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v47
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v72
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v74
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v73
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v13, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v15, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v17, v16, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v19, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v21, v20, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v166
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v75
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v61
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v78
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v77
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v79
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v76
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v90
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v89
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v92
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v91
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v93
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v88
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v104
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v95
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v105
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v94
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v108
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v107
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v110
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v109
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v23, v22, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v25, v24, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v27, v26, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v29, v28, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v31, v30, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v111
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v106
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v122
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v121
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v123
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v32, v120
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v33, v125
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v34, v124
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v35, v126
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v36, v127
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v32, v31, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v34, v33, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v36, v35, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr91
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr93
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr95
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr105
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr108
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr107
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr110
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr109
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr111
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr106
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr122
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr121
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr123
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr120
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr125
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr124
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr126
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr127
+; GFX11-FAKE16-NEXT:  .LBB44_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB44_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v134, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v118, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v131, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v116, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v129, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, v35, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v126, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v127, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v125, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v124, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v98, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v116, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v98, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v112, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v99, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v103, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v120, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v122, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v121, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v99, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v111, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v81, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v81, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v101, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v97, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v83, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v106, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v110, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v109, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v83, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v108, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v107, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v86, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v80, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v68, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v69, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v105, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v94, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v104, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v95, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v68, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v93, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v64, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v64, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v42, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v183, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v180, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v88, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v92, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v91, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v65, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v90, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v89, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v69, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v179, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v165, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v80, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v177, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v163, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v166, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v79, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v76, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v78, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v85, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v75, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v145, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v97, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v151, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v148, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v178, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v164, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v61, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v74, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v73, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v101, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v72, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v63, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v103, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v162, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v146, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v149, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v144, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v147, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v62, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v60, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v59, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v118, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v58, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v119, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v119, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v135, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v130, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v133, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v117, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v44, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v57, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v56, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v117, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v46, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v45, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v129, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v114, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v96, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v96, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v102, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v55, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v43, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v41, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v40, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v55, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v181, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v37, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v52, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v53, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v50, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v150, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v176, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v167, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v50, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v161, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v160, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v52, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v51, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v49, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v48, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v36, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v132, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v128, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v115, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v100, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v113, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v39, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, v32, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v71, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v82, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v84, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v70, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v66, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, 0x300, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, 0x300, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v38, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, 0x300, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v39, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v33, v38, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v39, v34, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v52, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v50, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v96, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v129, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v117, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v119, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v118, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v112, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v103, v16, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v101, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v97, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v85, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v80, v20, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v69, v21, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v68, v24, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v67, v25, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v86, v26, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v83, v27, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v81, v28, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v99, v29, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v98, v30, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v116, v31, 0x5040100
+; GFX11-FAKE16-NEXT:  .LBB44_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v127, off, s32 offset:392
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v126, off, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v125, off, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v124, off, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v123, off, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v122, off, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v121, off, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v120, off, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:516
+; GFX11-FAKE16-NEXT:    s_clause 0xf
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:580
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -75313,1179 +84354,2445 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64bf16_to_v128i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x15
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:96
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:92
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:88
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:84
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:80
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:76
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:72
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:68
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:64
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:60
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:12
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB45_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v76, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 8, v17
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[19:20]
-; GFX11-NEXT:    v_lshrrev_b64 v[67:68], 24, v[17:18]
-; GFX11-NEXT:  .LBB45_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB45_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v33, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v33
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_add3_u32 v17, v17, v33, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v37, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_add3_u32 v37, v37, v18, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
-; GFX11-NEXT:    v_perm_b32 v69, v77, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v34
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 24, v69
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v38, v38, v34, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 8, v69
-; GFX11-NEXT:    v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20
-; GFX11-NEXT:    v_bfe_u32 v48, v35, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v35
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v37
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v36, v48, v35, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v35, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v36, v39, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
-; GFX11-NEXT:    v_add3_u32 v35, v35, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_perm_b32 v68, v34, v33, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v34, v20, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v35, v37, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22
-; GFX11-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
-; GFX11-NEXT:    v_bfe_u32 v35, v38, 16, 1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v68
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 8, v68
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v34, v19, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v39, v36, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT:    v_add3_u32 v35, v35, v38, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v65, v19, v18, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v20, v39, v36, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 24, v65
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v20, v34, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v37
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v34
-; GFX11-NEXT:    v_bfe_u32 v36, v20, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v35, v39, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
-; GFX11-NEXT:    v_add3_u32 v36, v36, v20, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_perm_b32 v64, v35, v34, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 8, v65
-; GFX11-NEXT:    v_bfe_u32 v48, v37, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v36, v38, vcc_lo
-; GFX11-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 16, v64
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v64
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[64:65]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v35, v22, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v22
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v24
-; GFX11-NEXT:    v_bfe_u32 v36, v39, 16, 1
-; GFX11-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v22, v48, v37, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v39
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_add3_u32 v36, v36, v39, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v35, v21, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v37
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v71, v21, v20, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v21
-; GFX11-NEXT:    v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v35
-; GFX11-NEXT:    v_bfe_u32 v37, v22, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v36, v48, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v22
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    v_add_f32_e32 v38, 0x40c00000, v38
-; GFX11-NEXT:    v_add3_u32 v37, v37, v22, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v70, v36, v35, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v48, 0x40c00000, v23
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v49, v38, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26
-; GFX11-NEXT:    v_bfe_u32 v36, v24, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v24
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_bfe_u32 v37, v48, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_add3_u32 v36, v36, v24, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v24, v49, v38, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v48
-; GFX11-NEXT:    v_add3_u32 v37, v37, v48, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 16, v70
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 16, v23
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v24, v36, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v24, 0x40c00000, v39
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_perm_b32 v81, v23, v22, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v38, v24, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v37, v37, v49, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v24
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_add3_u32 v38, v38, v24, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v80, v37, v36, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v37, v26, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v49, 0x40c00000, v25
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v26
-; GFX11-NEXT:    v_bfe_u32 v50, v39, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v38, v48, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v37, v37, v26, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_bfe_u32 v38, v49, 16, 1
-; GFX11-NEXT:    v_add3_u32 v26, v50, v39, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v28
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v49
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v37, v25, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v39
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT:    v_add3_u32 v38, v38, v49, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 16, v80
-; GFX11-NEXT:    v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v48, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_bfe_u32 v39, v26, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v38, v38, v50, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v26
-; GFX11-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_add3_u32 v39, v39, v26, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v82, v38, v37, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v38, v28, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v50, 0x40c00000, v27
-; GFX11-NEXT:    v_or_b32_e32 v27, 0x400000, v28
-; GFX11-NEXT:    v_bfe_u32 v51, v48, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v39, v49, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v38, v38, v28, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_bfe_u32 v39, v50, 16, 1
-; GFX11-NEXT:    v_add3_u32 v28, v51, v48, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v49, 16, v30
-; GFX11-NEXT:    v_or_b32_e32 v51, 0x400000, v50
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v38, v27, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v48
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT:    v_add3_u32 v39, v39, v50, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_perm_b32 v83, v25, v24, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v38, v28, v38, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v49
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v48, v28, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v39, v39, v51, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v28
-; GFX11-NEXT:    v_add_f32_e32 v49, 0x40c00000, v49
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_add3_u32 v48, v48, v28, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v84, v39, v38, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v39, v30, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v51, 0x40c00000, v29
-; GFX11-NEXT:    v_or_b32_e32 v29, 0x400000, v30
-; GFX11-NEXT:    v_bfe_u32 v52, v49, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v48, v50, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v39, v39, v30, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    v_bfe_u32 v48, v51, 16, 1
-; GFX11-NEXT:    v_add3_u32 v30, v52, v49, 0x7fff
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v32
-; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v51
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v39, v29, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v49
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-NEXT:    v_add3_u32 v48, v48, v51, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 24, v83
-; GFX11-NEXT:    v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-NEXT:    v_bfe_u32 v49, v30, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v48, v48, v52, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v51, 0x400000, v30
-; GFX11-NEXT:    v_add_f32_e32 v50, 0x40c00000, v50
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    v_add3_u32 v49, v49, v30, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v86, v48, v39, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v48, v32, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v52, 0x40c00000, v31
-; GFX11-NEXT:    v_or_b32_e32 v31, 0x400000, v32
-; GFX11-NEXT:    v_bfe_u32 v53, v50, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v49, v51, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v48, v48, v32, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_bfe_u32 v49, v52, 16, 1
-; GFX11-NEXT:    v_add3_u32 v32, v53, v50, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v2
-; GFX11-NEXT:    v_or_b32_e32 v53, 0x400000, v52
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v48, v31, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v50
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT:    v_add3_u32 v49, v49, v52, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_perm_b32 v85, v27, v26, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v48, v32, v48, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v32, 0x40c00000, v51
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v50, v32, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v49, v49, v53, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v32
-; GFX11-NEXT:    v_add_f32_e32 v51, 0x40c00000, v51
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_add3_u32 v50, v50, v32, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v96, v49, v48, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v49, v2, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v53, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v2
-; GFX11-NEXT:    v_bfe_u32 v55, v51, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v50, v52, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v49, v49, v2, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v32, v53, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v51
-; GFX11-NEXT:    v_perm_b32 v87, v29, v28, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v49, v54, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v49, v55, v51, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-NEXT:    v_add3_u32 v52, v32, v53, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v53
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v49, v50, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v49, 0x40c00000, v54
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_bfe_u32 v51, v49, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v50, v52, v55, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_or_b32_e32 v53, 0x400000, v49
-; GFX11-NEXT:    v_add3_u32 v51, v51, v49, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v98, v50, v32, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v50, v4, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v52, 0x40c00000, v52
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v4
-; GFX11-NEXT:    v_add3_u32 v50, v50, v4, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v55, v52, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v49, v51, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v51, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v53, 0x400000, v52
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 16, v98
-; GFX11-NEXT:    v_lshrrev_b32_e32 v76, 8, v98
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v50, v54, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v50, v55, v52, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    v_add3_u32 v51, v51, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_add_f32_e32 v52, 0x40c00000, v54
-; GFX11-NEXT:    v_cndmask_b32_e32 v50, v50, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v52
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v51, v55, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v51, v52, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v53, 0x40c00000, v53
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_perm_b32 v100, v3, v50, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v3, v6, 16, 1
-; GFX11-NEXT:    v_add3_u32 v51, v51, v52, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v6
-; GFX11-NEXT:    v_bfe_u32 v66, v53, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v52, v5, 16, 1
-; GFX11-NEXT:    v_add3_u32 v3, v3, v6, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v51, v51, v54, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_add3_u32 v6, v66, v53, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v53
-; GFX11-NEXT:    v_add3_u32 v52, v52, v5, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v55, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v97, v31, v30, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v103, v3, v51, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 16, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v54, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v52, v66, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v52, v6, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v54, 0x40c00000, v54
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_perm_b32 v102, v5, v53, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT:    v_add3_u32 v52, v52, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v8
-; GFX11-NEXT:    v_bfe_u32 v67, v54, 16, 1
-; GFX11-NEXT:    v_perm_b32 v99, v2, v1, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v5, v5, v8, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v52, v55, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_bfe_u32 v52, v7, 16, 1
-; GFX11-NEXT:    v_add3_u32 v8, v67, v54, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v54
-; GFX11-NEXT:    v_or_b32_e32 v67, 0x400000, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v66, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v66, 16, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-NEXT:    v_add3_u32 v52, v52, v7, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v54, v8, v55, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v8, 0x40c00000, v66
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v52, v67, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v52, v8, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v55, 0x40c00000, v55
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_perm_b32 v182, v7, v54, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v7, v10, 16, 1
-; GFX11-NEXT:    v_add3_u32 v52, v52, v8, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v67, 0x400000, v10
-; GFX11-NEXT:    v_bfe_u32 v112, v55, 16, 1
-; GFX11-NEXT:    v_perm_b32 v183, v5, v6, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v7, v7, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v52, v66, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_bfe_u32 v52, v9, 16, 1
-; GFX11-NEXT:    v_add3_u32 v10, v112, v55, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v55
-; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v67, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-NEXT:    v_add3_u32 v52, v52, v9, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_perm_b32 v101, v4, v49, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 16, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v49
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v50
-; GFX11-NEXT:    v_bfe_u32 v66, v10, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11
-; GFX11-NEXT:    v_bfe_u32 v67, v12, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v176, v9, v55, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v9, 0x40c00000, v52
-; GFX11-NEXT:    v_add3_u32 v52, v66, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v10
-; GFX11-NEXT:    v_add3_u32 v67, v67, v12, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v12
-; GFX11-NEXT:    v_bfe_u32 v113, v9, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v9
-; GFX11-NEXT:    v_add3_u32 v52, v113, v9, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v177, v7, v8, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14
-; GFX11-NEXT:    v_bfe_u32 v112, v11, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67
-; GFX11-NEXT:    v_add3_u32 v66, v112, v11, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v67, 0x400000, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v112, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_bfe_u32 v113, v52, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
-; GFX11-NEXT:    v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112
-; GFX11-NEXT:    v_add3_u32 v67, v113, v52, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v52
-; GFX11-NEXT:    v_bfe_u32 v113, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    v_bfe_u32 v114, v66, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v115, 0x400000, v66
-; GFX11-NEXT:    v_perm_b32 v162, v11, v9, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v52, v67, v112, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v67, v113, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v14
-; GFX11-NEXT:    v_add3_u32 v113, v114, v66, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v114, 16, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_bfe_u32 v116, v13, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-NEXT:    v_add3_u32 v112, v116, v13, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v149, v14, v52, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v114, v67, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v66, v113, v115, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v113, 0x400000, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v115, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_perm_b32 v163, v12, v10, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115
-; GFX11-NEXT:    v_add3_u32 v113, v114, v67, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v114, 0x400000, v67
-; GFX11-NEXT:    v_bfe_u32 v115, v16, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-NEXT:    v_bfe_u32 v116, v112, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v117, 0x400000, v112
-; GFX11-NEXT:    v_or_b32_e32 v118, 0x400000, v15
-; GFX11-NEXT:    v_perm_b32 v148, v13, v66, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v67, v113, v114, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v114, v115, v16, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v115, 0x400000, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_bfe_u32 v113, v15, 16, 1
-; GFX11-NEXT:    v_add3_u32 v116, v116, v112, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v33
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[96:97]
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v114, v115, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v112, v112
-; GFX11-NEXT:    v_add3_u32 v113, v113, v15, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[86:87]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[84:85]
-; GFX11-NEXT:    v_perm_b32 v135, v16, v67, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v112, v116, v117, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v52
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v53
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v113, v118, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-NEXT:    v_perm_b32 v134, v15, v112, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v112
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 16, v48
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v51
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[134:135]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[148:149]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[162:163]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[176:177]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v37
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[182:183]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[82:83]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v67
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v66
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v54
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[102:103]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[98:99]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[80:81]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v39
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[100:101]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[70:71]
-; GFX11-NEXT:    v_lshrrev_b64 v[67:68], 24, v[68:69]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 24, v135
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 8, v135
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 16, v134
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v134
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 24, v149
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 8, v149
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 16, v148
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 8, v148
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 24, v163
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 8, v163
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 16, v162
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 8, v162
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 24, v177
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v177
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 16, v176
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 8, v176
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 24, v183
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 8, v183
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 16, v182
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v182
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 24, v103
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 8, v103
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 16, v102
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 8, v102
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 24, v101
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 8, v101
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 16, v100
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 8, v100
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 24, v99
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 8, v99
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 24, v97
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 16, v96
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 24, v87
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 16, v86
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 24, v85
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 16, v84
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 16, v82
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 24, v81
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 24, v71
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v77
-; GFX11-NEXT:  .LBB45_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v76
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v63
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v75
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v66, 0xff, v60
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v39, v55, v39
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v73
-; GFX11-NEXT:    v_or_b32_e32 v54, v66, v54
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v56
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v166
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v39
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v55, v55, v65
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v62
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v52
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v65
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v58
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v160
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v51
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v54, v65, v66
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v45
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v44
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v55
-; GFX11-NEXT:    v_or_b32_e32 v53, v65, v53
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v42
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v147
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v41
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v39
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v54
-; GFX11-NEXT:    v_or_b32_e32 v53, v55, v65
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v183
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v181
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v54
-; GFX11-NEXT:    v_or_b32_e32 v52, v55, v52
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v180
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xff, v132
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v179
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v52
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v53
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_or_b32_e32 v52, v54, v55
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xff, v167
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v177
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v165
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_or_b32_e32 v51, v54, v51
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v39
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v52
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v51
-; GFX11-NEXT:    v_and_b32_e32 v51, 0xff, v119
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v164
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v162
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xff, v161
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v150
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v118
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v149
-; GFX11-NEXT:    v_or_b32_e32 v51, v51, v52
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v53
-; GFX11-NEXT:    v_or_b32_e32 v50, v54, v50
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v55
-; GFX11-NEXT:    v_or_b32_e32 v52, v65, v66
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    v_or_b32_e32 v1, v9, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v10, v51
-; GFX11-NEXT:    v_or_b32_e32 v3, v11, v50
-; GFX11-NEXT:    v_or_b32_e32 v4, v12, v52
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v148
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v145
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v144
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v116
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v135
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v134
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v131
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v130
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v114
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v129
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v74
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v72
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v39, v48
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v61
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v69
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v59
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v57
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v47
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v46
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v146
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v43
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v21
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v70
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v40
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v71
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v133
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v182
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v38, v39
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v178
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v37
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v24
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v81
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v128
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v176
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v82
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v163
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v83
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v117
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v151
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v103
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v35
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v36, v35
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v115
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v102
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v86
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v101
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v87
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v113
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v100
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v31
-; GFX11-NEXT:    v_lshlrev_b16 v28, 8, v96
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v99
-; GFX11-NEXT:    v_lshlrev_b16 v30, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v97
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v112
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v98
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v26, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v27, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v28, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v29, v33, v34
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-NEXT:    s_clause 0x15
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:96
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v128i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v45, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v46, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v47, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v56, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v57, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v58, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v59, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v60, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v61, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v62, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v63, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v72, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v73, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v74, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v75, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v76, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v77, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v78, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v79, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v88, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v89, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v90, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v91, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v92, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v93, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v94, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v95, s32 offset:32
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v104, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v105, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v106, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v107, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v108, s32 offset:12
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr108_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr107_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr106_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr105_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr95_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr93_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr180_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr90_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr164_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr88_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr165_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr46_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr75_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr178_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr63_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr179_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr62_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr74_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr57_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr45_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr56_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr42_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr43_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr89_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr40_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr59_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr182_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr60_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr181_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr94_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr167_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr77_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr76_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr104_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr91_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr92_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr79_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr73_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr72_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr61_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr58_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr47_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr41_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr183_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr177_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr176_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr166_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB45_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v167, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v181, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v182, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v40, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v62, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v78, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v88, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v90, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v93, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v95, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v105, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v106, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v107, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v108, 8, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v183, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v41, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v47, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v72, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v79, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[68:69], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v144.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v131.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v132.h, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v162.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v147.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v149.h, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v180.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v164.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v165.h, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v46.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v178.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v179.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v74.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v45.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v42.h, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v89.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v59.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v60.h, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v94.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v77.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v76.h, v14.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v104.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v91.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v92.h, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v71.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v22.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v96.h, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.h, v24.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v101.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v97.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.h, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v112.h, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v99.h, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v100.h, v28.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v113.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v103.h, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v102.h, v30.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v116.h, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v114.h, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v115.h, v32.h
+; GFX11-TRUE16-NEXT:  .LBB45_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB45_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v34, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v55, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v55.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v70, v36, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v50, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v80, v37, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v48, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v80.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v18, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v33, v70
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v17, 0xffff, v34, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v72, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v79, 8, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v71, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v81.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v83, v33, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v22, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v83.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v20, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v33, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v22
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v82, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v34, v71
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v82.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v47, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_cndmask_b32 v84, v19, v39
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v22, v84
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v85, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v183, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v41, 8, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v24
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v85.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v35, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v86, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v87, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v86.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v23
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v21
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v96, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v96.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v24, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v33, v87
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v23, 0xffff, v36, v23
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v176, 8, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v97, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v98, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v26, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v101, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v97.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v26, v38 :: v_dual_add_f32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v101.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v25, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v33, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v28
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v99, v26, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v34, v98
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v100, v25, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v25, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v99.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v28, v100
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v25
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v112, v34, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v37, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v112.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v35, v27
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v27
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v103.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v113, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v30, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v113.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v30, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v33, v102
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v29, 0xffff, v36, v29
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 24, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v29
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v114, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v32, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v115, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v114.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v116, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v116.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v31, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v32, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v31, v31, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v33, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v131, v32, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v32, 0xffff, v34, v115
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v132, v31, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v31, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v2.l, v131.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 24, v32
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v144, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v132
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v147, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v35.l, v144.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v106, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v107, 8, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v149, v33, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v35, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v108, 8, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v162, v34, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v36.l, v162.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v36, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v36, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v33.l, v147.h
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v34, 0x40c00000, v38 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v105, 8, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v33, v149
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v93, 24, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v95, 8, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v164, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v165, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v34.l, v164.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v180, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v37.l, v180.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v6, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v33, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v178, v6, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v34, v165
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v179, v5, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v37, v36
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_mov_b16_e64 v8.l, v178.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v78, 24, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v46, v35, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v179
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v88, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v90, 8, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v34, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v62, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v45, v7, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v46.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v42, v35, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v45.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v37, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v35, v42
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v59, v38, v50, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v48, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v59.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v74, v35, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v60, v48, v52 :: v_dual_add_f32 v37, 0x40c00000, v51
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v14, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v7, v60
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v36, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v74.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v35, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v181, 24, v12
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v36, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v182, 8, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v39, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v89, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v48, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v76, v37, v38 :: v_dual_and_b32 v37, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v49, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v77, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v49, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v77.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v39, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v94, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v37, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v48, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v91, v13, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v50, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v94.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v104, v39, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v89.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v38, v76
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v92, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v104.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v91.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v39, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 24, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v35, v92
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v37, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v34, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[68:69], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v167, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v40, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v7
+; GFX11-TRUE16-NEXT:  .LBB45_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v144.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v108.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v132.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v106.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v149.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v1.h, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v131.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v107.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v78.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v105.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v162.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v4, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v95.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v93.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v5.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v180.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v90.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v164.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v88.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v165.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v8, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.h, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v46.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v75.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v8, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v178.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v63.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v179.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v62.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v74.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v10, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v57.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v45.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v56.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v10, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v89.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v40.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v12, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v42.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v43.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v60.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v181.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v94.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v167.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v59.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v182.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v77.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v163.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v20, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v76.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v104.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v12, v14
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v91.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v92.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v73.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v79.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v72.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v16, v18
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v61.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v47.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v58.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v44.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v18, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v15, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v177.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v41.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v183.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v176.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v166.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v101.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v97.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v24, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v112.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v20, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v22, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v103.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v26, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v28, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v102.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v119.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v114.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v115.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v25, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v29, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT:    s_clause 0x5
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v108, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v107, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v106, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v105, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v104, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v95, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v94, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v93, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v92, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v91, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v90, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v89, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v88, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v79, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v78, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v77, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v76, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v75, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v74, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v73, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v72, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v63, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v62, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v61, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v60, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v59, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v58, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v57, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v56, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v47, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v46, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v45, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    s_clause 0x4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v128i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x15
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:12
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB45_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v76, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-FAKE16-NEXT:  .LBB45_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB45_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v69, v77, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 24, v69
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v69
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v48, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v36, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v68, v34, v33, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v35, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v34, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v38, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v68
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v68
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v34, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v36, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v38, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v65, v19, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v39, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 16, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 24, v65
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v20, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v34
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v64, v35, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 8, v65
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v37, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v36, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 16, v64
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v64
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[64:65]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v48, v37, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v35, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v71, v21, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v21
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v70, v36, v35, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v38, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v36, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v49, v38, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 16, v70
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 16, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v24, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_perm_b32 v81, v23, v22, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v80, v37, v36, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v38, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v26, v50, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v37, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 16, v80
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v38, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v82, v38, v37, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v39, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v38, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v28, v51, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v49, 16, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v38, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v50, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v83, v25, v24, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v28, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v49
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v49
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v84, v39, v38, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v48, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v52, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v39, v29, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v51, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 24, v83
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v48, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v50
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v86, v48, v39, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v32
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v53, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v49, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v48, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v53, v50, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v48, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v85, v27, v26, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v32, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v51
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v32
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v51
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v50, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v96, v49, v48, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v50, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v49, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_perm_b32 v87, v29, v28, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v49, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v55, v51, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v32, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v49, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v54
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v52, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v51, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v98, v50, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v52
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v50, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v51, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 16, v98
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v76, 8, v98
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v50, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v55, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v51, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v54
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v51, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v53
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v100, v3, v50, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v51, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v66, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v53
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v97, v31, v30, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v103, v3, v51, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 16, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v52, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v54
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v102, v5, v53, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v67, v54, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v99, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v52, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v67, v54, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v8, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v66
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v52, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v55
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v182, v7, v54, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v112, v55, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v183, v5, v6, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v52, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v112, v55, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v52, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v101, v4, v49, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 16, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v50
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v67, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v176, v9, v55, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v52
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v66, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v67, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v9, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v113, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v177, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v112, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v26
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67
+; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v112, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v112, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v113, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v114, v66, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v115, 0x400000, v66
+; GFX11-FAKE16-NEXT:    v_perm_b32 v162, v11, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v67, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v113, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_add3_u32 v113, v114, v66, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v114, 16, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v116, v13, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    v_add3_u32 v112, v116, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v149, v14, v52, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v114, v67, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v66, v113, v115, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v113, 0x400000, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v115, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_perm_b32 v163, v12, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v17
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115
+; GFX11-FAKE16-NEXT:    v_add3_u32 v113, v114, v67, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v114, 0x400000, v67
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v115, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v116, v112, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v117, 0x400000, v112
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v118, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_perm_b32 v148, v13, v66, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v113, v114, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v114, v115, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v115, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v113, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v116, v116, v112, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[96:97]
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v114, v115, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v112, v112
+; GFX11-FAKE16-NEXT:    v_add3_u32 v113, v113, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[86:87]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[84:85]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v135, v16, v67, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v112, v116, v117, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v52
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v53
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v113, v118, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX11-FAKE16-NEXT:    v_perm_b32 v134, v15, v112, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v112
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v48
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v51
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[134:135]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[148:149]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[162:163]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[176:177]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[182:183]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[82:83]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v67
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v66
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v54
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[102:103]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[98:99]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[80:81]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v55
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[100:101]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[70:71]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[68:69]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v135
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v135
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 16, v134
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v134
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 24, v149
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 8, v149
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v148
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v148
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 24, v163
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 8, v163
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 16, v162
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v162
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 24, v177
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v177
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 16, v176
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 8, v176
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 24, v183
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 8, v183
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 16, v182
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v182
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 24, v103
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 8, v103
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 16, v102
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v102
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 24, v101
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 8, v101
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 16, v100
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 8, v100
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 24, v99
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 8, v99
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 24, v97
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v97
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 16, v96
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v96
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v87
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v87
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 16, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v85
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 16, v84
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v84
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v83
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 16, v82
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 24, v81
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v81
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 24, v71
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v71
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v70
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v77
+; GFX11-FAKE16-NEXT:  .LBB45_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v76
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v63
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v66, 0xff, v60
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, v55, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v73
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v66, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v56
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v166
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v55, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v62
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v160
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v65, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v45
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v44
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v65, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v147
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v41
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v55, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v55, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v180
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v132
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v179
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v54, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v167
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v54, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v51, 0xff, v119
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v164
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v162
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v161
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v150
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v149
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v51, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, v54, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v65, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v148
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v145
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v144
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v116
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v134
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v114
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v72
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v39, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v61
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v59
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v47
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v43
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v40
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v38, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v128
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v176
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v163
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v103
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v36, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v115
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v102
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v100
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v33, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    s_clause 0x5
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-FAKE16-NEXT:    s_clause 0x15
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -81276,923 +91583,1689 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v128i8_to_v64f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:580
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:576
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:572
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:568
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:564
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:560
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:556
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:552
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:548
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:544
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:540
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:536
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:532
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:528
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:524
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:520
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:516
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:512
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:508
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:504
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:500
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:496
-; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:492
-; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:488
-; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:484
-; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:480
-; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:476
-; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:472
-; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:468
-; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:464
-; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:460
-; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:456
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:452
-; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:448
-; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:444
-; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:440
-; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:436
-; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:432
-; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:428
-; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:424
-; GFX11-NEXT:    scratch_store_b32 off, v120, s32 offset:420
-; GFX11-NEXT:    scratch_store_b32 off, v121, s32 offset:416
-; GFX11-NEXT:    scratch_store_b32 off, v122, s32 offset:412
-; GFX11-NEXT:    scratch_store_b32 off, v123, s32 offset:408
-; GFX11-NEXT:    scratch_store_b32 off, v124, s32 offset:404
-; GFX11-NEXT:    scratch_store_b32 off, v125, s32 offset:400
-; GFX11-NEXT:    scratch_store_b32 off, v126, s32 offset:396
-; GFX11-NEXT:    scratch_store_b32 off, v127, s32 offset:392
-; GFX11-NEXT:    v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24
-; GFX11-NEXT:    v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26
-; GFX11-NEXT:    v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20
-; GFX11-NEXT:    v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16
-; GFX11-NEXT:    v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4
-; GFX11-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:380
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-NEXT:    scratch_load_u16 v134, off, s32 offset:372
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:364
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:356
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:348
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:340
-; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:332
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:324
-; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:316
-; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:308
-; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:300
-; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:292
-; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:284
-; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:276
-; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:268
-; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:260
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:252
-; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:248
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-NEXT:    scratch_load_u16 v88, off, s32 offset:240
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:236
-; GFX11-NEXT:    scratch_load_u16 v93, off, s32 offset:232
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:228
-; GFX11-NEXT:    scratch_load_u16 v91, off, s32 offset:224
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:220
-; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v150, off, s32 offset:388
-; GFX11-NEXT:    scratch_load_u16 v182, off, s32
-; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v61, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v63, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v72, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v73, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:152
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_u16 v79, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_u16 v89, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_u16 v90, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_u16 v95, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_u16 v104, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_u16 v105, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_u16 v42, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_u16 v177, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v70, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v82, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v128, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v132, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v161, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v160, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v176, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v167, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v181, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-NEXT:    v_lshlrev_b16 v127, 8, v0
-; GFX11-NEXT:    v_lshlrev_b16 v126, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v124, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v125, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v120, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v123, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v121, 8, v12
-; GFX11-NEXT:    v_lshlrev_b16 v122, 8, v14
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    v_lshlrev_b16 v106, 8, v16
-; GFX11-NEXT:    v_lshlrev_b16 v111, 8, v18
-; GFX11-NEXT:    v_lshlrev_b16 v109, 8, v20
-; GFX11-NEXT:    v_lshlrev_b16 v110, 8, v22
-; GFX11-NEXT:    v_lshlrev_b16 v107, 8, v24
-; GFX11-NEXT:    v_lshlrev_b16 v108, 8, v26
-; GFX11-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-NEXT:    v_lshlrev_b16 v88, 8, v88
-; GFX11-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-NEXT:    v_lshlrev_b16 v93, 8, v93
-; GFX11-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-NEXT:    v_lshlrev_b16 v91, 8, v91
-; GFX11-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-NEXT:    v_lshlrev_b16 v92, 8, v92
-; GFX11-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
-; GFX11-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-NEXT:    v_lshlrev_b16 v150, 8, v182
-; GFX11-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-NEXT:    v_lshlrev_b16 v41, 8, v40
-; GFX11-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-NEXT:    v_lshlrev_b16 v40, 8, v43
-; GFX11-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-NEXT:    v_lshlrev_b16 v43, 8, v44
-; GFX11-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-NEXT:    v_lshlrev_b16 v182, 8, v45
-; GFX11-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-NEXT:    v_lshlrev_b16 v46, 8, v46
-; GFX11-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-NEXT:    v_lshlrev_b16 v45, 8, v47
-; GFX11-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-NEXT:    v_lshlrev_b16 v57, 8, v56
-; GFX11-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-NEXT:    v_lshlrev_b16 v56, 8, v58
-; GFX11-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-NEXT:    v_lshlrev_b16 v58, 8, v59
-; GFX11-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-NEXT:    v_lshlrev_b16 v44, 8, v60
-; GFX11-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-NEXT:    v_lshlrev_b16 v60, 8, v61
-; GFX11-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-NEXT:    v_lshlrev_b16 v59, 8, v62
-; GFX11-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-NEXT:    v_lshlrev_b16 v62, 8, v63
-; GFX11-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-NEXT:    v_lshlrev_b16 v47, 8, v72
-; GFX11-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-NEXT:    v_lshlrev_b16 v72, 8, v73
-; GFX11-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-NEXT:    v_lshlrev_b16 v63, 8, v74
-; GFX11-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-NEXT:    v_lshlrev_b16 v74, 8, v75
-; GFX11-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-NEXT:    v_lshlrev_b16 v73, 8, v76
-; GFX11-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-NEXT:    v_lshlrev_b16 v75, 8, v77
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v61, 8, v78
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshlrev_b16 v78, 8, v79
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v77, 8, v89
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b16 v79, 8, v90
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v76, 8, v95
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b16 v90, 8, v104
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v89, 8, v105
-; GFX11-NEXT:    v_lshlrev_b16 v104, 8, v94
-; GFX11-NEXT:    v_lshlrev_b16 v95, 8, v31
-; GFX11-NEXT:    v_lshlrev_b16 v105, 8, v30
-; GFX11-NEXT:    v_lshlrev_b16 v94, 8, v28
-; GFX11-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB46_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v48
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v84
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v51
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v34
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v115
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v66
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v128
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v113
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v132
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v100
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v161
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v160
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v176
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v167
-; GFX11-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v5, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v11, v10, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v37
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v102
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v114
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v96
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v133
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v135
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v130
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v181
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v150
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v41
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v40
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v43
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v182
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v46
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v45
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v57
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v56
-; GFX11-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v14, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v16, v15, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v147
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v119
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v149
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v144
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v162
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v146
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v178
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v164
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v151
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v148
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v58
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v44
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v60
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v59
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v62
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v47
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v72
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v63
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v74
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v73
-; GFX11-NEXT:    v_perm_b32 v12, v13, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v15, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v17, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v19, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v16, v21, v20, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v166
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v145
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v177
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v163
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v179
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v165
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v183
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v180
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v42
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v65
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v75
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v61
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v78
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v77
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v79
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v76
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v90
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v89
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v92
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v91
-; GFX11-NEXT:    v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v18, v20, v19, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v19, v22, v21, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v20, v24, v23, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v21, v26, v25, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v83
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v101
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v93
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v88
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v104
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v95
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v105
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v94
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v108
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v107
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v110
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v109
-; GFX11-NEXT:    v_perm_b32 v22, v23, v22, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v23, v25, v24, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v24, v27, v26, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v25, v29, v28, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v26, v31, v30, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v103
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v112
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v99
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v129
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v98
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v131
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v116
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v134
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v118
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v111
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v106
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v122
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v121
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v123
-; GFX11-NEXT:    v_or_b32_e32 v32, v32, v120
-; GFX11-NEXT:    v_or_b32_e32 v33, v33, v125
-; GFX11-NEXT:    v_or_b32_e32 v34, v34, v124
-; GFX11-NEXT:    v_or_b32_e32 v35, v35, v126
-; GFX11-NEXT:    v_or_b32_e32 v36, v36, v127
-; GFX11-NEXT:    v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v28, v30, v29, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v29, v32, v31, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v30, v34, v33, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v31, v36, v35, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr78
-; GFX11-NEXT:    ; implicit-def: $vgpr77
-; GFX11-NEXT:    ; implicit-def: $vgpr79
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr90
-; GFX11-NEXT:    ; implicit-def: $vgpr89
-; GFX11-NEXT:    ; implicit-def: $vgpr92
-; GFX11-NEXT:    ; implicit-def: $vgpr91
-; GFX11-NEXT:    ; implicit-def: $vgpr93
-; GFX11-NEXT:    ; implicit-def: $vgpr88
-; GFX11-NEXT:    ; implicit-def: $vgpr104
-; GFX11-NEXT:    ; implicit-def: $vgpr95
-; GFX11-NEXT:    ; implicit-def: $vgpr105
-; GFX11-NEXT:    ; implicit-def: $vgpr94
-; GFX11-NEXT:    ; implicit-def: $vgpr108
-; GFX11-NEXT:    ; implicit-def: $vgpr107
-; GFX11-NEXT:    ; implicit-def: $vgpr110
-; GFX11-NEXT:    ; implicit-def: $vgpr109
-; GFX11-NEXT:    ; implicit-def: $vgpr111
-; GFX11-NEXT:    ; implicit-def: $vgpr106
-; GFX11-NEXT:    ; implicit-def: $vgpr122
-; GFX11-NEXT:    ; implicit-def: $vgpr121
-; GFX11-NEXT:    ; implicit-def: $vgpr123
-; GFX11-NEXT:    ; implicit-def: $vgpr120
-; GFX11-NEXT:    ; implicit-def: $vgpr125
-; GFX11-NEXT:    ; implicit-def: $vgpr124
-; GFX11-NEXT:    ; implicit-def: $vgpr126
-; GFX11-NEXT:    ; implicit-def: $vgpr127
-; GFX11-NEXT:  .LBB46_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB46_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v134, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v118, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v131, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v116, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v129, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v35, v35, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v126, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v127, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v125, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v124, v3
-; GFX11-NEXT:    v_add_nc_u16 v33, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v98, 3
-; GFX11-NEXT:    v_add_nc_u16 v116, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v98, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v1, v112, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_add_nc_u16 v3, v99, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v103, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v123, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v120, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v122, v1
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v121, v3
-; GFX11-NEXT:    v_add_nc_u16 v99, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v111, v4
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v81, 3
-; GFX11-NEXT:    v_add_nc_u16 v81, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v101, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v86, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v97, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v83, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v106, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v110, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v109, v0
-; GFX11-NEXT:    v_add_nc_u16 v83, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v108, v3
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v107, v4
-; GFX11-NEXT:    v_add_nc_u16 v86, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v67, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v80, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v68, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v69, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v105, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v94, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v104, v2
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v95, v3
-; GFX11-NEXT:    v_add_nc_u16 v68, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v93, v4
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v64, 3
-; GFX11-NEXT:    v_add_nc_u16 v64, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_add_nc_u16 v0, v42, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v65, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_add_nc_u16 v3, v183, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v180, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v88, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v91, v1
-; GFX11-NEXT:    v_add_nc_u16 v65, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v90, v3
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v89, v4
-; GFX11-NEXT:    v_add_nc_u16 v69, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_add_nc_u16 v1, v179, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v165, 3
-; GFX11-NEXT:    v_add_nc_u16 v80, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_add_nc_u16 v0, v177, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v163, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_add_nc_u16 v4, v166, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v79, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v76, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v78, v0
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-NEXT:    v_add_nc_u16 v85, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v75, v4
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v145, 3
-; GFX11-NEXT:    v_add_nc_u16 v97, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_add_nc_u16 v1, v151, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v148, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_add_nc_u16 v3, v178, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v164, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v61, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v74, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v73, v2
-; GFX11-NEXT:    v_add_nc_u16 v101, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v63, v4
-; GFX11-NEXT:    v_add_nc_u16 v103, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_add_nc_u16 v2, v162, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v146, 3
-; GFX11-NEXT:    v_add_nc_u16 v112, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_add_nc_u16 v1, v149, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v144, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u16 v4, v147, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v62, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v47, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v60, v1
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v59, v3
-; GFX11-NEXT:    v_add_nc_u16 v118, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v58, v4
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v119, 3
-; GFX11-NEXT:    v_add_nc_u16 v119, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_nc_u16 v2, v135, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v130, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_nc_u16 v3, v133, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v117, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v44, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v57, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-NEXT:    v_add_nc_u16 v117, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v46, v3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v45, v4
-; GFX11-NEXT:    v_add_nc_u16 v129, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v0, v114, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v96, 3
-; GFX11-NEXT:    v_add_nc_u16 v96, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v2, v102, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v87, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v55, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v43, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v182, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v41, v2
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v40, v3
-; GFX11-NEXT:    v_add_nc_u16 v55, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v181, v4
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v37, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v52, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v53, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v50, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v150, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v176, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v167, v1
-; GFX11-NEXT:    v_add_nc_u16 v50, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v161, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v160, v4
-; GFX11-NEXT:    v_add_nc_u16 v52, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v51, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v49, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v48, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v36, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v132, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v128, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v115, v0
-; GFX11-NEXT:    v_add_nc_u16 v34, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v100, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v113, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v39, 3
-; GFX11-NEXT:    v_add_nc_u16 v36, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v32, v32, 3
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-NEXT:    v_or_b32_e32 v35, v71, v35
-; GFX11-NEXT:    v_or_b32_e32 v33, v82, v33
-; GFX11-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-NEXT:    v_or_b32_e32 v36, v70, v36
-; GFX11-NEXT:    v_or_b32_e32 v32, v66, v32
-; GFX11-NEXT:    v_add_nc_u16 v35, 0x300, v35
-; GFX11-NEXT:    v_add_nc_u16 v33, 0x300, v33
-; GFX11-NEXT:    v_add_nc_u16 v38, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v36
-; GFX11-NEXT:    v_add_nc_u16 v32, 0x300, v32
-; GFX11-NEXT:    v_add_nc_u16 v36, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v39, 0x300, v1
-; GFX11-NEXT:    v_perm_b32 v1, v33, v38, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v32, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v39, v34, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v52, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v50, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v37, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v96, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v129, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v117, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v119, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v118, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v112, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v16, v103, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v17, v101, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v18, v97, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v19, v85, v19, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v20, v80, v20, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v21, v69, v21, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v24, v68, v24, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v25, v67, v25, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v26, v86, v26, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v27, v83, v27, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v28, v81, v28, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v29, v99, v29, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v30, v98, v30, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v31, v116, v31, 0x5040100
-; GFX11-NEXT:  .LBB46_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v127, off, s32 offset:392
-; GFX11-NEXT:    scratch_load_b32 v126, off, s32 offset:396
-; GFX11-NEXT:    scratch_load_b32 v125, off, s32 offset:400
-; GFX11-NEXT:    scratch_load_b32 v124, off, s32 offset:404
-; GFX11-NEXT:    scratch_load_b32 v123, off, s32 offset:408
-; GFX11-NEXT:    scratch_load_b32 v122, off, s32 offset:412
-; GFX11-NEXT:    scratch_load_b32 v121, off, s32 offset:416
-; GFX11-NEXT:    scratch_load_b32 v120, off, s32 offset:420
-; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:424
-; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:428
-; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:432
-; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:436
-; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:440
-; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:444
-; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:448
-; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:452
-; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:456
-; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:460
-; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:464
-; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:468
-; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:472
-; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:476
-; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:480
-; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:484
-; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:488
-; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:492
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:496
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:500
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:504
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:508
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:512
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:516
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:520
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:524
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:528
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:532
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:536
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:540
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:544
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:548
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:552
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:556
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:560
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:564
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:568
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:572
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:576
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:580
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v116, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v117, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v118, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v119, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v128, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v132, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v113.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.l, 8, v114.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.h, 8, v115.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v116.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v116.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v117.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB46_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB46_4
+; GFX11-TRUE16-NEXT:  .LBB46_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB46_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v85.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v100.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v81.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v98.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v102.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v102.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v112.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v112.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v113.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v103.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v115.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v118.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v119.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v114.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v119.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v128.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v130.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v117.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v130.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v131.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v129.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v132.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v133.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v133.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v134.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v135.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v144.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v134.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v145.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v146.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v146.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v147.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v145.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v148.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v149.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v149.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v147.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v150.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v151.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB46_2
+; GFX11-TRUE16-NEXT:  .LBB46_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v50.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v48.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v149.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v147.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v148.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v145.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v146.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v147.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v145.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v135.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v144.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v134.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v135.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v100.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v100.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v132.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v133.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v134.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v132.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v97.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v96.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v87.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v131.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v129.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v130.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v96.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v117.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v128.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v129.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v119.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v85.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v128.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v119.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v114.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v118.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v117.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v114.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v116.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v116.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v68.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v67.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v115.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v113.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v112.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v65.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v112.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v102.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v103.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v102.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v99.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v87.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v98.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v97.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v70.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v82.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v83.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v81.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v32.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v32.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:456
+; GFX11-FAKE16-NEXT:    s_clause 0xf
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v127, s32 offset:392
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v150, off, s32 offset:388
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v61, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v63, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v72, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v73, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:136
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:144
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:152
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:160
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:168
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:176
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:192
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:200
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:212
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:204
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:188
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:180
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:172
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v177, off, s32 offset:164
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:156
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:148
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:140
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v70, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v132, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v161, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v160, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v176, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v167, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v150, 8, v182
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v41, 8, v40
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v40, 8, v43
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v43, 8, v44
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v45
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v46
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v47
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v56
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v58
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v58, 8, v59
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v44, 8, v60
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v60, 8, v61
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v59, 8, v62
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v63
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v72
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v73
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v74
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v74, 8, v75
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v76
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v75, 8, v77
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v78
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v78, 8, v79
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v77, 8, v89
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v90
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v76, 8, v95
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v104
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v105
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v94
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB46_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v82
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v128
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v132
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v161
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v160
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v176
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v167
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v5, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v9, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v37
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v102
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v130
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v150
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v41
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v40
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v43
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v57
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v56
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v14, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v16, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v144
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v148
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v58
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v44
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v60
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v59
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v62
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v47
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v72
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v74
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v73
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v13, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v15, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v17, v16, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v19, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v21, v20, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v166
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v75
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v61
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v78
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v77
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v79
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v76
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v90
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v89
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v92
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v91
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v93
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v88
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v104
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v95
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v105
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v94
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v108
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v107
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v110
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v109
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v23, v22, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v25, v24, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v27, v26, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v29, v28, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v31, v30, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v111
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v106
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v122
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v121
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v123
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v32, v120
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v33, v125
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v34, v124
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v35, v126
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v36, v127
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v32, v31, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v34, v33, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v36, v35, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr91
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr93
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr95
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr105
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr108
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr107
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr110
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr109
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr111
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr106
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr122
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr121
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr123
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr120
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr125
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr124
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr126
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr127
+; GFX11-FAKE16-NEXT:  .LBB46_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB46_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v134, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v118, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v131, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v116, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v129, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, v35, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v126, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v127, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v125, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v124, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v98, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v116, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v98, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v112, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v99, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v103, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v120, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v122, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v121, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v99, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v111, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v81, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v81, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v101, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v97, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v83, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v106, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v110, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v109, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v83, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v108, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v107, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v86, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v80, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v68, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v69, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v105, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v94, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v104, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v95, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v68, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v93, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v64, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v64, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v42, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v183, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v180, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v88, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v92, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v91, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v65, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v90, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v89, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v69, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v179, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v165, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v80, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v177, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v163, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v166, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v79, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v76, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v78, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v85, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v75, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v145, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v97, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v151, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v148, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v178, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v164, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v61, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v74, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v73, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v101, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v72, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v63, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v103, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v162, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v146, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v149, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v144, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v147, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v62, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v60, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v59, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v118, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v58, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v119, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v119, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v135, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v130, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v133, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v117, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v44, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v57, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v56, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v117, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v46, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v45, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v129, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v114, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v96, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v96, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v102, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v55, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v43, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v41, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v40, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v55, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v181, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v37, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v52, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v53, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v50, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v150, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v176, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v167, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v50, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v161, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v160, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v52, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v51, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v49, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v48, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v36, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v132, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v128, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v115, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v100, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v113, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v39, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, v32, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v71, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v82, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v84, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v70, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v66, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, 0x300, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, 0x300, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v38, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, 0x300, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v39, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v33, v38, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v39, v34, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v52, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v50, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v96, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v129, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v117, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v119, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v118, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v112, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v103, v16, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v101, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v97, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v85, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v80, v20, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v69, v21, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v68, v24, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v67, v25, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v86, v26, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v83, v27, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v81, v28, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v99, v29, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v98, v30, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v116, v31, 0x5040100
+; GFX11-FAKE16-NEXT:  .LBB46_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v127, off, s32 offset:392
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v126, off, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v125, off, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v124, off, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v123, off, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v122, off, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v121, off, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v120, off, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:516
+; GFX11-FAKE16-NEXT:    s_clause 0xf
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:580
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -86308,692 +97381,1256 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64f16_to_v128i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:88
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:84
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:80
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:76
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:72
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:68
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:64
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:60
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:56
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:52
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:48
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:44
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:40
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:36
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:32
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:28
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:24
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:20
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:16
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:12
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB47_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[19:20]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v17
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[17:18]
-; GFX11-NEXT:  .LBB47_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB47_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
-; GFX11-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-NEXT:    v_lshrrev_b64 v[50:51], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-NEXT:    v_lshrrev_b64 v[51:52], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-NEXT:    v_lshrrev_b64 v[52:53], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[64:65], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-NEXT:    v_lshrrev_b64 v[53:54], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
-; GFX11-NEXT:    v_lshrrev_b64 v[65:66], 24, v[19:20]
-; GFX11-NEXT:    v_lshrrev_b64 v[54:55], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
-; GFX11-NEXT:    v_lshrrev_b64 v[66:67], 24, v[17:18]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v102, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v116, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v119, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v129, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v131, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v133, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v144, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v150, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v151, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v161, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v164, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v165, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v167, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v177, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v179, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v181, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v183, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v42, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v45, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v47, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v57, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v60, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v62, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v72, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v74, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 24, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 8, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v101, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v103, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v112, 24, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v113, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v114, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v118, 16, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v128, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v130, 24, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v132, 16, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v134, 8, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v135, 16, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 8, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v148, 24, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v149, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v160, 8, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v162, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v163, 8, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v166, 24, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v176, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v178, 8, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v180, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v182, 8, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v40, 24, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v41, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v44, 8, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v46, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v56, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v58, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v61, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v73, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v75, 8, v17
-; GFX11-NEXT:  .LBB47_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v74
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v60
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v72
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v67, 0xff, v47
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v39, v55, v39
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v63
-; GFX11-NEXT:    v_or_b32_e32 v54, v67, v54
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v42
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v62
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v39
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v55, v55, v64
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v57
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v52
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v64
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v45
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v43
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v51
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v54, v64, v67
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v183
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v181
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v55
-; GFX11-NEXT:    v_or_b32_e32 v53, v64, v53
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v179
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v177
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v167
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v39
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v54
-; GFX11-NEXT:    v_or_b32_e32 v53, v55, v64
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v165
-; GFX11-NEXT:    v_and_b32_e32 v55, 0xff, v164
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v54
-; GFX11-NEXT:    v_or_b32_e32 v52, v55, v52
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v161
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xff, v151
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v150
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v52
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v53
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_or_b32_e32 v52, v54, v55
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xff, v145
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v147
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v144
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_or_b32_e32 v51, v54, v51
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v39
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v52
-; GFX11-NEXT:    v_lshlrev_b32_e32 v39, 16, v51
-; GFX11-NEXT:    v_and_b32_e32 v51, 0xff, v133
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v131
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v129
-; GFX11-NEXT:    v_and_b32_e32 v54, 0xff, v119
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v117
-; GFX11-NEXT:    v_and_b32_e32 v64, 0xff, v116
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v115
-; GFX11-NEXT:    v_or_b32_e32 v51, v51, v52
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v53
-; GFX11-NEXT:    v_or_b32_e32 v50, v54, v50
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v55
-; GFX11-NEXT:    v_or_b32_e32 v52, v64, v67
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    v_or_b32_e32 v1, v9, v39
-; GFX11-NEXT:    v_or_b32_e32 v2, v10, v51
-; GFX11-NEXT:    v_or_b32_e32 v3, v11, v50
-; GFX11-NEXT:    v_or_b32_e32 v4, v12, v52
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v6, 8, v102
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v100
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v98
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v97
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v96
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v86
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v82
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v70
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v68
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v75
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v73
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v66
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v39, v48
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v61
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v59
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v58
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v19
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v56
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v46
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v65
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v44
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v41
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v40
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v21
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v182
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v180
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v178
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v176
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v166
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v38, v39
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v163
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v162
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v37
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v24
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v160
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v149
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v148
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v146
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v135
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v134
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v132
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v130
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v118
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v35
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v36, v35
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v20
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v22
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v114
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v113
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v112
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v103
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v101
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v99
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v87
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v85
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v31
-; GFX11-NEXT:    v_lshlrev_b16 v28, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v83
-; GFX11-NEXT:    v_lshlrev_b16 v30, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v81
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v71
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v69
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v26, v27, v28
-; GFX11-NEXT:    v_or_b32_e32 v27, v29, v30
-; GFX11-NEXT:    v_or_b32_e32 v28, v31, v32
-; GFX11-NEXT:    v_or_b32_e32 v29, v33, v34
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v21, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v22, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v23, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v28, v29
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
-; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
-; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
-; GFX11-NEXT:    s_clause 0x13
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:88
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64f16_to_v128i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr163_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr161_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[68:69], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v17
+; GFX11-TRUE16-NEXT:  .LBB47_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[67:68], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[68:69], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[19:20]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[17:18]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v114, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v115, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v119, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v130, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v131, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v134, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v135, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v147, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v149, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v151, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v161, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v162, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v31
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v30
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 24, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v28
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 8, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 24, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v23
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v116, 24, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v118, 8, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v129, 24, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v133, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v144, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v17
+; GFX11-TRUE16-NEXT:  .LBB47_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v163.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v161.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v51.l, v1.h, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v162.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v160.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v69.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v54, v51
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v68
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v69
+; GFX11-TRUE16-NEXT:    v_or_b16 v51.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v54, v67
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v51
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v3.h, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v51, v54
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v67
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v68
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v51, v54
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.h, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v5.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v51, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v66
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v135.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v134.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v65
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v51, v54
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v65
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v130.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v10.l, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v65
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v66
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v64, v65
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v117.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v51, v54
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v119.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v115.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v51.l, v12.h, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v65, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v12, v51
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v85.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v16, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v15, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v17, v18
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v112.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v38, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v17
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v20, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v22, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, v24, v25
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v116.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v26, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v70.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v64, 16, v67
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v54, v64
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v39, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v38, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, v35, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, v25, v26
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, v27, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, v29, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, v31, v32
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, v33, v24
+; GFX11-TRUE16-NEXT:    s_clause 0x5
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:12
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[17:18]
+; GFX11-FAKE16-NEXT:  .LBB47_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB47_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[64:65], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[53:54], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[65:66], 24, v[19:20]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[54:55], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[66:67], 24, v[17:18]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v100, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v102, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v115, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v116, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v117, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v119, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v129, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v131, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v133, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v144, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v145, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v147, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v150, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v151, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v161, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v164, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v165, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v167, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v177, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v179, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v181, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v183, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v42, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v45, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v47, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v57, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v60, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v62, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v63, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v72, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v74, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 24, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v101, 16, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v103, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v112, 24, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v113, 16, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v114, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v118, 16, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v128, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v130, 24, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v132, 16, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v134, 8, v26
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v135, 16, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v146, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v148, 24, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v149, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v160, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v162, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v163, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v166, 24, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v176, 16, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v178, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v180, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v182, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v40, 24, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v41, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v44, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v46, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v56, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v58, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v61, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v73, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v75, 8, v17
+; GFX11-FAKE16-NEXT:  .LBB47_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v74
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v60
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v72
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v67, 0xff, v47
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, v55, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v67, v54
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v62
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, v55, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v57
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v45
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v43
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, v64, v67
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v64, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v167
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, v55, v64
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v55, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v55, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v161
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v150
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v54, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v144
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v54, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v51, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v54, 0xff, v119
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v117
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v64, 0xff, v116
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, v51, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, v54, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, v64, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v9, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v10, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v11, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v12, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v102
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v75
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v73
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v39, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v61
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v59
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v58
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v56
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v46
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v44
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v41
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v40
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v182
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v176
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v166
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v38, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v160
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v148
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v146
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v132
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v130
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v36, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v113
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v101
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v85
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v30, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v27, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v29, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v31, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v33, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v28, v29
+; GFX11-FAKE16-NEXT:    s_clause 0x5
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-FAKE16-NEXT:    s_clause 0x13
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -91819,923 +103456,1689 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v128i8_to_v64i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:580
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:576
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:572
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:568
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:564
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:560
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:556
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:552
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:548
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:544
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:540
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:536
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:532
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:528
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:524
-; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:520
-; GFX11-NEXT:    scratch_store_b32 off, v72, s32 offset:516
-; GFX11-NEXT:    scratch_store_b32 off, v73, s32 offset:512
-; GFX11-NEXT:    scratch_store_b32 off, v74, s32 offset:508
-; GFX11-NEXT:    scratch_store_b32 off, v75, s32 offset:504
-; GFX11-NEXT:    scratch_store_b32 off, v76, s32 offset:500
-; GFX11-NEXT:    scratch_store_b32 off, v77, s32 offset:496
-; GFX11-NEXT:    scratch_store_b32 off, v78, s32 offset:492
-; GFX11-NEXT:    scratch_store_b32 off, v79, s32 offset:488
-; GFX11-NEXT:    scratch_store_b32 off, v88, s32 offset:484
-; GFX11-NEXT:    scratch_store_b32 off, v89, s32 offset:480
-; GFX11-NEXT:    scratch_store_b32 off, v90, s32 offset:476
-; GFX11-NEXT:    scratch_store_b32 off, v91, s32 offset:472
-; GFX11-NEXT:    scratch_store_b32 off, v92, s32 offset:468
-; GFX11-NEXT:    scratch_store_b32 off, v93, s32 offset:464
-; GFX11-NEXT:    scratch_store_b32 off, v94, s32 offset:460
-; GFX11-NEXT:    scratch_store_b32 off, v95, s32 offset:456
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_store_b32 off, v104, s32 offset:452
-; GFX11-NEXT:    scratch_store_b32 off, v105, s32 offset:448
-; GFX11-NEXT:    scratch_store_b32 off, v106, s32 offset:444
-; GFX11-NEXT:    scratch_store_b32 off, v107, s32 offset:440
-; GFX11-NEXT:    scratch_store_b32 off, v108, s32 offset:436
-; GFX11-NEXT:    scratch_store_b32 off, v109, s32 offset:432
-; GFX11-NEXT:    scratch_store_b32 off, v110, s32 offset:428
-; GFX11-NEXT:    scratch_store_b32 off, v111, s32 offset:424
-; GFX11-NEXT:    scratch_store_b32 off, v120, s32 offset:420
-; GFX11-NEXT:    scratch_store_b32 off, v121, s32 offset:416
-; GFX11-NEXT:    scratch_store_b32 off, v122, s32 offset:412
-; GFX11-NEXT:    scratch_store_b32 off, v123, s32 offset:408
-; GFX11-NEXT:    scratch_store_b32 off, v124, s32 offset:404
-; GFX11-NEXT:    scratch_store_b32 off, v125, s32 offset:400
-; GFX11-NEXT:    scratch_store_b32 off, v126, s32 offset:396
-; GFX11-NEXT:    scratch_store_b32 off, v127, s32 offset:392
-; GFX11-NEXT:    v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24
-; GFX11-NEXT:    v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26
-; GFX11-NEXT:    v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20
-; GFX11-NEXT:    v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16
-; GFX11-NEXT:    v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12
-; GFX11-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4
-; GFX11-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:384
-; GFX11-NEXT:    scratch_load_u16 v118, off, s32 offset:380
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:376
-; GFX11-NEXT:    scratch_load_u16 v134, off, s32 offset:372
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:368
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:364
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:360
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:356
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:352
-; GFX11-NEXT:    scratch_load_u16 v98, off, s32 offset:348
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:344
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:340
-; GFX11-NEXT:    scratch_load_u16 v12, off, s32 offset:336
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:332
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32 offset:328
-; GFX11-NEXT:    scratch_load_u16 v112, off, s32 offset:324
-; GFX11-NEXT:    scratch_load_u16 v16, off, s32 offset:320
-; GFX11-NEXT:    scratch_load_u16 v81, off, s32 offset:316
-; GFX11-NEXT:    scratch_load_u16 v18, off, s32 offset:312
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:308
-; GFX11-NEXT:    scratch_load_u16 v20, off, s32 offset:304
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:300
-; GFX11-NEXT:    scratch_load_u16 v22, off, s32 offset:296
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:292
-; GFX11-NEXT:    scratch_load_u16 v24, off, s32 offset:288
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:284
-; GFX11-NEXT:    scratch_load_u16 v26, off, s32 offset:280
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:276
-; GFX11-NEXT:    scratch_load_u16 v28, off, s32 offset:272
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:268
-; GFX11-NEXT:    scratch_load_u16 v30, off, s32 offset:264
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:260
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32 offset:256
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:252
-; GFX11-NEXT:    scratch_load_u16 v94, off, s32 offset:248
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:244
-; GFX11-NEXT:    scratch_load_u16 v88, off, s32 offset:240
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:236
-; GFX11-NEXT:    scratch_load_u16 v93, off, s32 offset:232
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:228
-; GFX11-NEXT:    scratch_load_u16 v91, off, s32 offset:224
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:220
-; GFX11-NEXT:    scratch_load_u16 v92, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v150, off, s32 offset:388
-; GFX11-NEXT:    scratch_load_u16 v182, off, s32
-; GFX11-NEXT:    scratch_load_u16 v40, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v43, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v44, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v45, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v46, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v47, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v56, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v58, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v59, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v60, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v61, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_u16 v62, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v63, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v72, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v73, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v74, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v75, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_u16 v76, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_u16 v77, off, s32 offset:152
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v78, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_u16 v79, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_u16 v89, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_u16 v90, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_u16 v95, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_u16 v104, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_u16 v105, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_u16 v42, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_u16 v180, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_u16 v183, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_u16 v165, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_u16 v179, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_u16 v163, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_u16 v177, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_u16 v145, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_u16 v166, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_u16 v148, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_u16 v151, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v164, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v178, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v146, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v162, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v144, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v149, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v119, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v147, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v135, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v133, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v70, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v82, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v128, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v132, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v161, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v160, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v176, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v167, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v181, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(62)
-; GFX11-NEXT:    v_lshlrev_b16 v127, 8, v0
-; GFX11-NEXT:    v_lshlrev_b16 v126, 8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v124, 8, v4
-; GFX11-NEXT:    v_lshlrev_b16 v125, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v120, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v123, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v121, 8, v12
-; GFX11-NEXT:    v_lshlrev_b16 v122, 8, v14
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    v_lshlrev_b16 v106, 8, v16
-; GFX11-NEXT:    v_lshlrev_b16 v111, 8, v18
-; GFX11-NEXT:    v_lshlrev_b16 v109, 8, v20
-; GFX11-NEXT:    v_lshlrev_b16 v110, 8, v22
-; GFX11-NEXT:    v_lshlrev_b16 v107, 8, v24
-; GFX11-NEXT:    v_lshlrev_b16 v108, 8, v26
-; GFX11-NEXT:    s_waitcnt vmcnt(61)
-; GFX11-NEXT:    v_lshlrev_b16 v88, 8, v88
-; GFX11-NEXT:    s_waitcnt vmcnt(59)
-; GFX11-NEXT:    v_lshlrev_b16 v93, 8, v93
-; GFX11-NEXT:    s_waitcnt vmcnt(57)
-; GFX11-NEXT:    v_lshlrev_b16 v91, 8, v91
-; GFX11-NEXT:    s_waitcnt vmcnt(55)
-; GFX11-NEXT:    v_lshlrev_b16 v92, 8, v92
-; GFX11-NEXT:    s_waitcnt vmcnt(54)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
-; GFX11-NEXT:    s_waitcnt vmcnt(53)
-; GFX11-NEXT:    v_lshlrev_b16 v150, 8, v182
-; GFX11-NEXT:    s_waitcnt vmcnt(52)
-; GFX11-NEXT:    v_lshlrev_b16 v41, 8, v40
-; GFX11-NEXT:    s_waitcnt vmcnt(51)
-; GFX11-NEXT:    v_lshlrev_b16 v40, 8, v43
-; GFX11-NEXT:    s_waitcnt vmcnt(50)
-; GFX11-NEXT:    v_lshlrev_b16 v43, 8, v44
-; GFX11-NEXT:    s_waitcnt vmcnt(49)
-; GFX11-NEXT:    v_lshlrev_b16 v182, 8, v45
-; GFX11-NEXT:    s_waitcnt vmcnt(48)
-; GFX11-NEXT:    v_lshlrev_b16 v46, 8, v46
-; GFX11-NEXT:    s_waitcnt vmcnt(47)
-; GFX11-NEXT:    v_lshlrev_b16 v45, 8, v47
-; GFX11-NEXT:    s_waitcnt vmcnt(46)
-; GFX11-NEXT:    v_lshlrev_b16 v57, 8, v56
-; GFX11-NEXT:    s_waitcnt vmcnt(45)
-; GFX11-NEXT:    v_lshlrev_b16 v56, 8, v58
-; GFX11-NEXT:    s_waitcnt vmcnt(44)
-; GFX11-NEXT:    v_lshlrev_b16 v58, 8, v59
-; GFX11-NEXT:    s_waitcnt vmcnt(43)
-; GFX11-NEXT:    v_lshlrev_b16 v44, 8, v60
-; GFX11-NEXT:    s_waitcnt vmcnt(42)
-; GFX11-NEXT:    v_lshlrev_b16 v60, 8, v61
-; GFX11-NEXT:    s_waitcnt vmcnt(41)
-; GFX11-NEXT:    v_lshlrev_b16 v59, 8, v62
-; GFX11-NEXT:    s_waitcnt vmcnt(40)
-; GFX11-NEXT:    v_lshlrev_b16 v62, 8, v63
-; GFX11-NEXT:    s_waitcnt vmcnt(39)
-; GFX11-NEXT:    v_lshlrev_b16 v47, 8, v72
-; GFX11-NEXT:    s_waitcnt vmcnt(38)
-; GFX11-NEXT:    v_lshlrev_b16 v72, 8, v73
-; GFX11-NEXT:    s_waitcnt vmcnt(37)
-; GFX11-NEXT:    v_lshlrev_b16 v63, 8, v74
-; GFX11-NEXT:    s_waitcnt vmcnt(36)
-; GFX11-NEXT:    v_lshlrev_b16 v74, 8, v75
-; GFX11-NEXT:    s_waitcnt vmcnt(35)
-; GFX11-NEXT:    v_lshlrev_b16 v73, 8, v76
-; GFX11-NEXT:    s_waitcnt vmcnt(34)
-; GFX11-NEXT:    v_lshlrev_b16 v75, 8, v77
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v61, 8, v78
-; GFX11-NEXT:    s_waitcnt vmcnt(32)
-; GFX11-NEXT:    v_lshlrev_b16 v78, 8, v79
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v77, 8, v89
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_lshlrev_b16 v79, 8, v90
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v76, 8, v95
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_lshlrev_b16 v90, 8, v104
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v89, 8, v105
-; GFX11-NEXT:    v_lshlrev_b16 v104, 8, v94
-; GFX11-NEXT:    v_lshlrev_b16 v95, 8, v31
-; GFX11-NEXT:    v_lshlrev_b16 v105, 8, v30
-; GFX11-NEXT:    v_lshlrev_b16 v94, 8, v28
-; GFX11-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v48
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v70
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v71
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v84
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v82
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v51
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v34
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v52
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v115
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v66
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v128
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v113
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v132
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v100
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v161
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v160
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v176
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v167
-; GFX11-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v5, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v11, v10, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v37
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v102
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v114
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v96
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v133
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v117
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v135
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v130
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v181
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v150
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v41
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v40
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v43
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v182
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v46
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v45
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v57
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v56
-; GFX11-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v14, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v16, v15, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v147
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v119
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v149
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v144
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v162
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v146
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v178
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v164
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v151
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v148
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v58
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v44
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v60
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v59
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v62
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v47
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v72
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v63
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v74
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v73
-; GFX11-NEXT:    v_perm_b32 v12, v13, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v15, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v17, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v19, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v16, v21, v20, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v166
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v145
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v177
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v163
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v179
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v165
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v183
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v180
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v42
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v65
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v75
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v61
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v78
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v77
-; GFX11-NEXT:    v_or_b32_e32 v21, v21, v79
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v76
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v90
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v89
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v92
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v91
-; GFX11-NEXT:    v_perm_b32 v17, v18, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v18, v20, v19, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v19, v22, v21, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v20, v24, v23, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v21, v26, v25, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v83
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v101
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v22, v22, v93
-; GFX11-NEXT:    v_or_b32_e32 v23, v23, v88
-; GFX11-NEXT:    v_or_b32_e32 v24, v24, v104
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v95
-; GFX11-NEXT:    v_or_b32_e32 v26, v26, v105
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v94
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v108
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v107
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v110
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v109
-; GFX11-NEXT:    v_perm_b32 v22, v23, v22, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v23, v25, v24, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v24, v27, v26, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v25, v29, v28, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v26, v31, v30, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xff, v103
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xff, v112
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v99
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xff, v129
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v98
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v131
-; GFX11-NEXT:    v_and_b32_e32 v34, 0xff, v116
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v134
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v118
-; GFX11-NEXT:    v_or_b32_e32 v27, v27, v111
-; GFX11-NEXT:    v_or_b32_e32 v28, v28, v106
-; GFX11-NEXT:    v_or_b32_e32 v29, v29, v122
-; GFX11-NEXT:    v_or_b32_e32 v30, v30, v121
-; GFX11-NEXT:    v_or_b32_e32 v31, v31, v123
-; GFX11-NEXT:    v_or_b32_e32 v32, v32, v120
-; GFX11-NEXT:    v_or_b32_e32 v33, v33, v125
-; GFX11-NEXT:    v_or_b32_e32 v34, v34, v124
-; GFX11-NEXT:    v_or_b32_e32 v35, v35, v126
-; GFX11-NEXT:    v_or_b32_e32 v36, v36, v127
-; GFX11-NEXT:    v_perm_b32 v27, v28, v27, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v28, v30, v29, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v29, v32, v31, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v30, v34, v33, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v31, v36, v35, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr133
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr135
-; GFX11-NEXT:    ; implicit-def: $vgpr130
-; GFX11-NEXT:    ; implicit-def: $vgpr147
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr149
-; GFX11-NEXT:    ; implicit-def: $vgpr144
-; GFX11-NEXT:    ; implicit-def: $vgpr162
-; GFX11-NEXT:    ; implicit-def: $vgpr146
-; GFX11-NEXT:    ; implicit-def: $vgpr178
-; GFX11-NEXT:    ; implicit-def: $vgpr164
-; GFX11-NEXT:    ; implicit-def: $vgpr151
-; GFX11-NEXT:    ; implicit-def: $vgpr148
-; GFX11-NEXT:    ; implicit-def: $vgpr166
-; GFX11-NEXT:    ; implicit-def: $vgpr145
-; GFX11-NEXT:    ; implicit-def: $vgpr177
-; GFX11-NEXT:    ; implicit-def: $vgpr163
-; GFX11-NEXT:    ; implicit-def: $vgpr179
-; GFX11-NEXT:    ; implicit-def: $vgpr165
-; GFX11-NEXT:    ; implicit-def: $vgpr183
-; GFX11-NEXT:    ; implicit-def: $vgpr180
-; GFX11-NEXT:    ; implicit-def: $vgpr42
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr129
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr131
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr134
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr128
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr132
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr161
-; GFX11-NEXT:    ; implicit-def: $vgpr160
-; GFX11-NEXT:    ; implicit-def: $vgpr176
-; GFX11-NEXT:    ; implicit-def: $vgpr167
-; GFX11-NEXT:    ; implicit-def: $vgpr181
-; GFX11-NEXT:    ; implicit-def: $vgpr150
-; GFX11-NEXT:    ; implicit-def: $vgpr41
-; GFX11-NEXT:    ; implicit-def: $vgpr40
-; GFX11-NEXT:    ; implicit-def: $vgpr43
-; GFX11-NEXT:    ; implicit-def: $vgpr182
-; GFX11-NEXT:    ; implicit-def: $vgpr46
-; GFX11-NEXT:    ; implicit-def: $vgpr45
-; GFX11-NEXT:    ; implicit-def: $vgpr57
-; GFX11-NEXT:    ; implicit-def: $vgpr56
-; GFX11-NEXT:    ; implicit-def: $vgpr58
-; GFX11-NEXT:    ; implicit-def: $vgpr44
-; GFX11-NEXT:    ; implicit-def: $vgpr60
-; GFX11-NEXT:    ; implicit-def: $vgpr59
-; GFX11-NEXT:    ; implicit-def: $vgpr62
-; GFX11-NEXT:    ; implicit-def: $vgpr47
-; GFX11-NEXT:    ; implicit-def: $vgpr72
-; GFX11-NEXT:    ; implicit-def: $vgpr63
-; GFX11-NEXT:    ; implicit-def: $vgpr74
-; GFX11-NEXT:    ; implicit-def: $vgpr73
-; GFX11-NEXT:    ; implicit-def: $vgpr75
-; GFX11-NEXT:    ; implicit-def: $vgpr61
-; GFX11-NEXT:    ; implicit-def: $vgpr78
-; GFX11-NEXT:    ; implicit-def: $vgpr77
-; GFX11-NEXT:    ; implicit-def: $vgpr79
-; GFX11-NEXT:    ; implicit-def: $vgpr76
-; GFX11-NEXT:    ; implicit-def: $vgpr90
-; GFX11-NEXT:    ; implicit-def: $vgpr89
-; GFX11-NEXT:    ; implicit-def: $vgpr92
-; GFX11-NEXT:    ; implicit-def: $vgpr91
-; GFX11-NEXT:    ; implicit-def: $vgpr93
-; GFX11-NEXT:    ; implicit-def: $vgpr88
-; GFX11-NEXT:    ; implicit-def: $vgpr104
-; GFX11-NEXT:    ; implicit-def: $vgpr95
-; GFX11-NEXT:    ; implicit-def: $vgpr105
-; GFX11-NEXT:    ; implicit-def: $vgpr94
-; GFX11-NEXT:    ; implicit-def: $vgpr108
-; GFX11-NEXT:    ; implicit-def: $vgpr107
-; GFX11-NEXT:    ; implicit-def: $vgpr110
-; GFX11-NEXT:    ; implicit-def: $vgpr109
-; GFX11-NEXT:    ; implicit-def: $vgpr111
-; GFX11-NEXT:    ; implicit-def: $vgpr106
-; GFX11-NEXT:    ; implicit-def: $vgpr122
-; GFX11-NEXT:    ; implicit-def: $vgpr121
-; GFX11-NEXT:    ; implicit-def: $vgpr123
-; GFX11-NEXT:    ; implicit-def: $vgpr120
-; GFX11-NEXT:    ; implicit-def: $vgpr125
-; GFX11-NEXT:    ; implicit-def: $vgpr124
-; GFX11-NEXT:    ; implicit-def: $vgpr126
-; GFX11-NEXT:    ; implicit-def: $vgpr127
-; GFX11-NEXT:  .LBB48_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v134, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v118, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v131, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v116, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v129, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v35, v35, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v126, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v127, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v125, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v124, v3
-; GFX11-NEXT:    v_add_nc_u16 v33, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v31, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v98, 3
-; GFX11-NEXT:    v_add_nc_u16 v116, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v30, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v98, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v1, v112, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_add_nc_u16 v3, v99, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v103, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v123, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v120, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v122, v1
-; GFX11-NEXT:    v_add_nc_u16 v29, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v121, v3
-; GFX11-NEXT:    v_add_nc_u16 v99, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v111, v4
-; GFX11-NEXT:    v_add_nc_u16 v28, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v81, 3
-; GFX11-NEXT:    v_add_nc_u16 v81, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v101, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v86, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v97, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v83, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v106, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v110, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v109, v0
-; GFX11-NEXT:    v_add_nc_u16 v83, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v108, v3
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v107, v4
-; GFX11-NEXT:    v_add_nc_u16 v86, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v67, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v80, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v68, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v69, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v105, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v94, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v104, v2
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v95, v3
-; GFX11-NEXT:    v_add_nc_u16 v68, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v93, v4
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v64, 3
-; GFX11-NEXT:    v_add_nc_u16 v64, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_add_nc_u16 v0, v42, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v65, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_add_nc_u16 v3, v183, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v180, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v88, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v92, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v91, v1
-; GFX11-NEXT:    v_add_nc_u16 v65, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v90, v3
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v89, v4
-; GFX11-NEXT:    v_add_nc_u16 v69, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_add_nc_u16 v1, v179, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v165, 3
-; GFX11-NEXT:    v_add_nc_u16 v80, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_add_nc_u16 v0, v177, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v163, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_add_nc_u16 v4, v166, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v79, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v76, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v78, v0
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v77, v3
-; GFX11-NEXT:    v_add_nc_u16 v85, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v75, v4
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v145, 3
-; GFX11-NEXT:    v_add_nc_u16 v97, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_add_nc_u16 v1, v151, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v148, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_add_nc_u16 v3, v178, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v164, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v61, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v74, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v73, v2
-; GFX11-NEXT:    v_add_nc_u16 v101, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v72, v3
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v63, v4
-; GFX11-NEXT:    v_add_nc_u16 v103, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_add_nc_u16 v2, v162, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v146, 3
-; GFX11-NEXT:    v_add_nc_u16 v112, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_add_nc_u16 v1, v149, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v144, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u16 v4, v147, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v62, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v47, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v60, v1
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v59, v3
-; GFX11-NEXT:    v_add_nc_u16 v118, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v58, v4
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v119, 3
-; GFX11-NEXT:    v_add_nc_u16 v119, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_nc_u16 v2, v135, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v130, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_nc_u16 v3, v133, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v117, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v44, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v57, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v56, v0
-; GFX11-NEXT:    v_add_nc_u16 v117, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v46, v3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v45, v4
-; GFX11-NEXT:    v_add_nc_u16 v129, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v0, v114, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v96, 3
-; GFX11-NEXT:    v_add_nc_u16 v96, 0x300, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v2, v102, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v87, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v55, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v43, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v182, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v41, v2
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v40, v3
-; GFX11-NEXT:    v_add_nc_u16 v55, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v181, v4
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v37, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v52, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v53, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v50, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v150, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v176, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v167, v1
-; GFX11-NEXT:    v_add_nc_u16 v50, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v161, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v160, v4
-; GFX11-NEXT:    v_add_nc_u16 v52, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v51, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v49, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v48, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v36, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v132, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v128, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v115, v0
-; GFX11-NEXT:    v_add_nc_u16 v34, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v100, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v113, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v39, 3
-; GFX11-NEXT:    v_add_nc_u16 v36, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v32, v32, 3
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v32, 0xff, v32
-; GFX11-NEXT:    v_or_b32_e32 v35, v71, v35
-; GFX11-NEXT:    v_or_b32_e32 v33, v82, v33
-; GFX11-NEXT:    v_or_b32_e32 v0, v84, v0
-; GFX11-NEXT:    v_or_b32_e32 v36, v70, v36
-; GFX11-NEXT:    v_or_b32_e32 v32, v66, v32
-; GFX11-NEXT:    v_add_nc_u16 v35, 0x300, v35
-; GFX11-NEXT:    v_add_nc_u16 v33, 0x300, v33
-; GFX11-NEXT:    v_add_nc_u16 v38, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v36
-; GFX11-NEXT:    v_add_nc_u16 v32, 0x300, v32
-; GFX11-NEXT:    v_add_nc_u16 v36, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v39, 0x300, v1
-; GFX11-NEXT:    v_perm_b32 v1, v33, v38, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v35, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v32, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v39, v34, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v52, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v50, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v37, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v55, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v96, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v129, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v117, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v119, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v118, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v112, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v16, v103, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v17, v101, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v18, v97, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v19, v85, v19, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v20, v80, v20, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v21, v69, v21, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v22, v65, v22, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v23, v64, v23, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v24, v68, v24, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v25, v67, v25, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v26, v86, v26, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v27, v83, v27, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v28, v81, v28, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v29, v99, v29, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v30, v98, v30, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v31, v116, v31, 0x5040100
-; GFX11-NEXT:  .LBB48_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v127, off, s32 offset:392
-; GFX11-NEXT:    scratch_load_b32 v126, off, s32 offset:396
-; GFX11-NEXT:    scratch_load_b32 v125, off, s32 offset:400
-; GFX11-NEXT:    scratch_load_b32 v124, off, s32 offset:404
-; GFX11-NEXT:    scratch_load_b32 v123, off, s32 offset:408
-; GFX11-NEXT:    scratch_load_b32 v122, off, s32 offset:412
-; GFX11-NEXT:    scratch_load_b32 v121, off, s32 offset:416
-; GFX11-NEXT:    scratch_load_b32 v120, off, s32 offset:420
-; GFX11-NEXT:    scratch_load_b32 v111, off, s32 offset:424
-; GFX11-NEXT:    scratch_load_b32 v110, off, s32 offset:428
-; GFX11-NEXT:    scratch_load_b32 v109, off, s32 offset:432
-; GFX11-NEXT:    scratch_load_b32 v108, off, s32 offset:436
-; GFX11-NEXT:    scratch_load_b32 v107, off, s32 offset:440
-; GFX11-NEXT:    scratch_load_b32 v106, off, s32 offset:444
-; GFX11-NEXT:    scratch_load_b32 v105, off, s32 offset:448
-; GFX11-NEXT:    scratch_load_b32 v104, off, s32 offset:452
-; GFX11-NEXT:    scratch_load_b32 v95, off, s32 offset:456
-; GFX11-NEXT:    scratch_load_b32 v94, off, s32 offset:460
-; GFX11-NEXT:    scratch_load_b32 v93, off, s32 offset:464
-; GFX11-NEXT:    scratch_load_b32 v92, off, s32 offset:468
-; GFX11-NEXT:    scratch_load_b32 v91, off, s32 offset:472
-; GFX11-NEXT:    scratch_load_b32 v90, off, s32 offset:476
-; GFX11-NEXT:    scratch_load_b32 v89, off, s32 offset:480
-; GFX11-NEXT:    scratch_load_b32 v88, off, s32 offset:484
-; GFX11-NEXT:    scratch_load_b32 v79, off, s32 offset:488
-; GFX11-NEXT:    scratch_load_b32 v78, off, s32 offset:492
-; GFX11-NEXT:    scratch_load_b32 v77, off, s32 offset:496
-; GFX11-NEXT:    scratch_load_b32 v76, off, s32 offset:500
-; GFX11-NEXT:    scratch_load_b32 v75, off, s32 offset:504
-; GFX11-NEXT:    scratch_load_b32 v74, off, s32 offset:508
-; GFX11-NEXT:    scratch_load_b32 v73, off, s32 offset:512
-; GFX11-NEXT:    scratch_load_b32 v72, off, s32 offset:516
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:520
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:524
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:528
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:532
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:536
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:540
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:544
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:548
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:552
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:556
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:560
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:564
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:568
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:572
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:576
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:580
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v128i8_to_v64i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:384
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32 offset:380
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:376
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:372
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v150, off, s32 offset:368
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v39, off, s32 offset:364
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v150, off, s32 offset:360
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:356
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v147, off, s32 offset:352
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:348
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v149, off, s32 offset:344
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v49, off, s32 offset:340
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v149, off, s32 offset:336
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:332
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v148, off, s32 offset:328
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:324
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v145, off, s32 offset:320
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:316
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v148, off, s32 offset:312
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:308
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v147, off, s32 offset:304
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:300
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v146, off, s32 offset:296
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:292
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v146, off, s32 offset:288
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:284
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v145, off, s32 offset:280
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:276
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v134, off, s32 offset:272
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:268
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v144, off, s32 offset:264
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:260
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v144, off, s32 offset:256
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:252
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v135, off, s32 offset:248
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:244
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v132, off, s32 offset:240
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:236
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v135, off, s32 offset:232
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:228
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v134, off, s32 offset:224
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:220
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v133, off, s32 offset:216
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v160, off, s32 offset:388
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v101, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v103, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v113, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v113, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v114, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v114, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v115, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v116, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v116, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v117, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v117, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v118, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v118, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v119, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v119, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v128, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v128, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v129, off, s32 offset:136
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v129, off, s32 offset:144
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v130, off, s32 offset:152
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v130, off, s32 offset:160
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v131, off, s32 offset:168
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v131, off, s32 offset:176
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v132, off, s32 offset:184
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v133, off, s32 offset:192
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v151, off, s32 offset:200
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v151, off, s32 offset:208
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v100, off, s32 offset:212
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v98, off, s32 offset:204
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v100, off, s32 offset:196
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v85, off, s32 offset:188
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v97, off, s32 offset:180
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v86, off, s32 offset:172
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v96, off, s32 offset:164
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:156
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v87, off, s32 offset:148
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v80, off, s32 offset:140
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v84, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v86, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v96, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v85, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v82, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v84, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v83, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.l, v28.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v81.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v82.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v83.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v97.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v70.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v98.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v99.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v87.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v102.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.h, 8, v150.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v150.l, 8, v150.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.h, 8, v147.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.h, 8, v149.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v149.l, 8, v149.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.h, 8, v148.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.l, 8, v145.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v148.l, 8, v148.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v147.l, 8, v147.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.h, 8, v146.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v146.l, 8, v146.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v145.h, 8, v145.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.h, 8, v134.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.h, 8, v144.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v144.l, 8, v144.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.h, 8, v135.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.l, 8, v132.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v135.l, 8, v135.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v134.l, 8, v134.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.h, 8, v133.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v160
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v101.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v112.h, 8, v103.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.l, 8, v113.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v113.h, 8, v113.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v103.h, 8, v114.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.l, 8, v114.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v115.h, 8, v115.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.l, 8, v116.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v116.h, 8, v116.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.l, 8, v117.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.l, 8, v117.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.l, 8, v118.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v118.h, 8, v118.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.l, 8, v119.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v114.h, 8, v119.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v119.h, 8, v128.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.l, 8, v128.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v128.h, 8, v129.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.l, 8, v129.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.l, 8, v130.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v117.h, 8, v130.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v130.h, 8, v131.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.l, 8, v131.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v131.h, 8, v132.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v129.h, 8, v133.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v132.h, 8, v151.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v133.l, 8, v151.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.l, 8, v31.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v151.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB48_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB48_4
+; GFX11-TRUE16-NEXT:  .LBB48_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB48_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v51.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v65.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v67.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v54.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v66.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v68.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v71.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v83.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v85.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v96.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v84.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v80.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.l, 0xff, v87.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v17.h, 0xff, v71.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.l, 0xff, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v18.h, 0xff, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.l, 0xff, v97.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v19.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.l, 0xff, v100.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v20.h, 0xff, v98.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.l, 0xff, v100.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v21.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v22.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v23.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v24.h, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v25.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v26.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.l, 0xff, v48.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v27.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v28.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v29.h, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v30.h, 0xff, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.l, 0xff, v50.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v31.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v81.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v81.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v82.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v97.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v70.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v98.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v99.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v99.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v87.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v101.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v102.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v102.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v112.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v101.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v112.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v113.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v113.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v103.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v115.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v115.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v116.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v116.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v117.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v114.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v118.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v118.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v119.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v114.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v119.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v128.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v128.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v16.h, v129.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v17.l, v130.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.h, v17.h, v117.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v18.l, v130.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.h, v18.h, v131.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v19.l, v131.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.h, v19.h, v129.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v20.l, v132.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.h, v20.h, v133.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.l, v21.l, v133.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v21.h, v21.h, v134.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v22.l, v135.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.h, v22.h, v132.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v23.l, v135.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.h, v23.h, v144.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v24.l, v144.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.h, v24.h, v134.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.l, v25.l, v145.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v25.h, v25.h, v146.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.l, v26.l, v146.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v26.h, v26.h, v147.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.l, v27.l, v148.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v27.h, v27.h, v145.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v28.l, v148.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.h, v28.h, v149.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.l, v29.l, v149.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v29.h, v29.h, v147.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.l, v30.l, v150.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v30.h, v30.h, v150.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.l, v31.l, v151.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v31.h, v31.h, v151.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr115_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr114_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr119_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr117_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr131_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr135_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr146_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr150_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-TRUE16-NEXT:  .LBB48_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v50.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v39.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v39.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v48.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v151.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v151.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v150.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v150.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v148.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v31.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v30.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v48.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v149.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v147.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v148.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v149.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v29.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v28.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v146.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v145.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v146.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v147.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v145.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v27.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v26.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v25.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v135.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v144.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v134.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v135.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v144.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v24.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v23.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v100.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v100.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v98.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v132.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v133.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v134.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v132.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v133.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v22.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v21.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v97.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v85.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v96.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v86.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v20.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v87.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v131.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v129.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v130.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v131.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v19.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v18.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v71.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v84.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v80.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v96.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v130.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v117.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v128.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v129.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v119.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v86.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v17.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v16.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v85.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v80.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v84.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v82.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v128.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v119.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v114.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v118.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v118.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v83.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v69.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v71.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v69.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v70.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v117.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v114.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v116.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v116.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v115.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v68.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v68.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v66.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v67.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v66.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v115.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v113.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v103.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v112.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v113.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v67.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v54.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v65.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v64.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v65.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v112.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v101.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v102.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v103.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v101.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v64.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v52.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v102.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v99.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v87.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v98.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v99.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v54.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v49.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v51.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v52.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v51.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v97.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v70.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v82.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.l, v83.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v32.h, v81.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v33.l, v81.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v32.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v32.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v33.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v128i8_to_v64i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s32 offset:580
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v45, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v46, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v47, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v56, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v57, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v58, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v59, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v60, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v61, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v62, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v63, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v72, s32 offset:516
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v73, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v74, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v75, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v76, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v77, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v78, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v79, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v88, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v89, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v90, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v91, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v92, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v93, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v94, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v95, s32 offset:456
+; GFX11-FAKE16-NEXT:    s_clause 0xf
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v104, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v105, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v106, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v107, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v108, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v109, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v110, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v111, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v120, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v121, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v122, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v123, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v124, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v125, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v126, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v127, s32 offset:392
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:384
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v118, off, s32 offset:380
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:376
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v134, off, s32 offset:372
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:368
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:364
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:360
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:356
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:352
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v98, off, s32 offset:348
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:344
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:340
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v12, off, s32 offset:336
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:332
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32 offset:328
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v112, off, s32 offset:324
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v16, off, s32 offset:320
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v81, off, s32 offset:316
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v18, off, s32 offset:312
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:308
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v20, off, s32 offset:304
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:300
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v22, off, s32 offset:296
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:292
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v24, off, s32 offset:288
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:284
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v26, off, s32 offset:280
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:276
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v28, off, s32 offset:272
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:268
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v30, off, s32 offset:264
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:260
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v31, off, s32 offset:256
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:252
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v94, off, s32 offset:248
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:244
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v88, off, s32 offset:240
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:236
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v93, off, s32 offset:232
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:228
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v91, off, s32 offset:224
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:220
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v92, off, s32 offset:216
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v150, off, s32 offset:388
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v182, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v40, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v43, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v44, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v45, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v46, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v47, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v56, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v58, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v59, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v60, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v61, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v62, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v63, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v72, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v73, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v74, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v75, off, s32 offset:136
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v76, off, s32 offset:144
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v77, off, s32 offset:152
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v78, off, s32 offset:160
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v79, off, s32 offset:168
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v89, off, s32 offset:176
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v90, off, s32 offset:184
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v95, off, s32 offset:192
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v104, off, s32 offset:200
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v105, off, s32 offset:208
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v42, off, s32 offset:212
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v180, off, s32 offset:204
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v183, off, s32 offset:196
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v165, off, s32 offset:188
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v179, off, s32 offset:180
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v163, off, s32 offset:172
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v177, off, s32 offset:164
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v145, off, s32 offset:156
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v166, off, s32 offset:148
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v148, off, s32 offset:140
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v151, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v164, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v178, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v146, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v162, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v144, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v149, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v119, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v147, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v135, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v133, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v70, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v128, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v132, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v161, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v160, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v176, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v167, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v181, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(62)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v127, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v126, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v124, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v125, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v120, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v123, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v121, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v122, 8, v14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v106, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v111, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v109, 8, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v110, 8, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v107, 8, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v108, 8, v26
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(61)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v88, 8, v88
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(59)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v93, 8, v93
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(57)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v91, 8, v91
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(55)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v92, 8, v92
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(54)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v150
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(53)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v150, 8, v182
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(52)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v41, 8, v40
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(51)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v40, 8, v43
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(50)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v43, 8, v44
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(49)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v182, 8, v45
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(48)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v46, 8, v46
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(47)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v45, 8, v47
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(46)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v57, 8, v56
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(45)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v56, 8, v58
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(44)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v58, 8, v59
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(43)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v44, 8, v60
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(42)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v60, 8, v61
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(41)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v59, 8, v62
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(40)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v62, 8, v63
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(39)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v47, 8, v72
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(38)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v72, 8, v73
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(37)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v63, 8, v74
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(36)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v74, 8, v75
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(35)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v73, 8, v76
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(34)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v75, 8, v77
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v61, 8, v78
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v78, 8, v79
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v77, 8, v89
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v79, 8, v90
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v76, 8, v95
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v90, 8, v104
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v89, 8, v105
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v104, 8, v94
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v95, 8, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v105, 8, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v94, 8, v28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v70
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v82
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v128
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v132
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v161
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v160
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v176
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v167
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v5, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v9, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v37
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v102
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v114
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v133
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v117
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v135
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v130
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v181
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v150
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v41
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v40
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v43
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v182
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v46
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v45
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v57
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v56
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v14, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v16, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v147
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v149
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v144
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v162
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v146
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v178
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v164
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v151
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v148
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v58
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v44
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v60
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v59
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v62
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v47
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v72
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v63
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v74
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v73
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v13, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v15, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v17, v16, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v19, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v21, v20, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v166
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v145
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v177
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v163
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v179
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v165
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v183
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v180
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v42
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v75
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v61
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v78
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v77
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v21, v79
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v76
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v90
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v89
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v92
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v91
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v22, v21, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v24, v23, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v26, v25, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v22, v93
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v23, v88
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v24, v104
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v95
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v26, v105
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v94
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v108
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v107
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v110
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v109
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v23, v22, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v25, v24, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v27, v26, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v29, v28, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v31, v30, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xff, v103
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xff, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xff, v129
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v98
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v131
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v34, 0xff, v116
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v134
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, v27, v111
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, v28, v106
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v29, v29, v122
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, v30, v121
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, v31, v123
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v32, v120
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v33, v125
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, v34, v124
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v35, v126
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v36, v127
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v30, v29, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v32, v31, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v34, v33, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v36, v35, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr133
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr135
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr130
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr147
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr149
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr144
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr162
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr146
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr178
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr164
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr151
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr148
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr166
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr145
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr177
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr163
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr179
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr165
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr183
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr180
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr42
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr129
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr131
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr134
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr128
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr132
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr161
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr160
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr176
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr167
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr181
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr150
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr41
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr40
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr43
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr182
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr46
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr45
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr57
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr56
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr58
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr44
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr60
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr59
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr62
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr47
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr72
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr63
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr74
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr73
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr75
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr61
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr78
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr77
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr79
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr76
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr90
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr89
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr92
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr91
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr93
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr88
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr104
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr95
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr105
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr94
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr108
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr107
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr110
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr109
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr111
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr106
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr122
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr121
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr123
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr120
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr125
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr124
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr126
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr127
+; GFX11-FAKE16-NEXT:  .LBB48_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v134, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v118, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v131, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v116, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v129, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, v35, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v126, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v127, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v125, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v124, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v31, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v98, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v116, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v30, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v98, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v112, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v99, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v103, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v123, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v120, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v122, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v29, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v121, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v99, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v111, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v28, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v81, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v81, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v101, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v97, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v83, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v106, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v110, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v109, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v83, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v108, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v107, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v86, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v80, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v68, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v69, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v105, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v94, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v104, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v95, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v68, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v93, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v64, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v64, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v42, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v65, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v183, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v180, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v88, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v92, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v91, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v65, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v90, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v89, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v69, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v179, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v165, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v80, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v177, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v163, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v166, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v79, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v76, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v78, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v77, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v85, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v75, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v145, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v97, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v151, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v148, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v178, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v164, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v61, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v74, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v73, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v101, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v72, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v63, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v103, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v162, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v146, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v112, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v149, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v144, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v147, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v62, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v47, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v60, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v59, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v118, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v58, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v119, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v119, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v135, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v130, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v133, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v117, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v44, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v57, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v56, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v117, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v46, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v45, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v129, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v114, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v96, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v96, 0x300, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v102, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v55, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v43, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v182, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v41, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v40, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v55, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v181, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v37, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v52, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v53, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v50, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v150, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v176, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v167, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v50, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v161, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v160, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v52, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v51, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v49, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v48, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v36, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v132, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v128, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v115, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v34, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v100, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v113, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v39, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, v32, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v32, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, v71, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, v82, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v84, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, v70, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v66, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v35, 0x300, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v33, 0x300, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v38, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v32, 0x300, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v36, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v39, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v33, v38, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v32, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v39, v34, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v52, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v50, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v96, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v129, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v117, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v119, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v118, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v112, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v103, v16, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v101, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v97, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v85, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v80, v20, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v69, v21, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v68, v24, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v67, v25, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v86, v26, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v83, v27, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v81, v28, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v99, v29, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v98, v30, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v116, v31, 0x5040100
+; GFX11-FAKE16-NEXT:  .LBB48_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v127, off, s32 offset:392
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v126, off, s32 offset:396
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v125, off, s32 offset:400
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v124, off, s32 offset:404
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v123, off, s32 offset:408
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v122, off, s32 offset:412
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v121, off, s32 offset:416
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v120, off, s32 offset:420
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v111, off, s32 offset:424
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v110, off, s32 offset:428
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v109, off, s32 offset:432
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v108, off, s32 offset:436
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v107, off, s32 offset:440
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v106, off, s32 offset:444
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v105, off, s32 offset:448
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v104, off, s32 offset:452
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v95, off, s32 offset:456
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v94, off, s32 offset:460
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v93, off, s32 offset:464
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v92, off, s32 offset:468
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v91, off, s32 offset:472
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v90, off, s32 offset:476
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v89, off, s32 offset:480
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v88, off, s32 offset:484
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v79, off, s32 offset:488
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v78, off, s32 offset:492
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v77, off, s32 offset:496
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v76, off, s32 offset:500
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v75, off, s32 offset:504
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v74, off, s32 offset:508
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v73, off, s32 offset:512
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v72, off, s32 offset:516
+; GFX11-FAKE16-NEXT:    s_clause 0xf
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v63, off, s32 offset:520
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v62, off, s32 offset:524
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v61, off, s32 offset:528
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v60, off, s32 offset:532
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v59, off, s32 offset:536
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v58, off, s32 offset:540
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v57, off, s32 offset:544
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v56, off, s32 offset:548
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v47, off, s32 offset:552
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v46, off, s32 offset:556
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v45, off, s32 offset:560
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s32 offset:564
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s32 offset:568
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s32 offset:572
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s32 offset:576
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s32 offset:580
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -95300,532 +107703,1071 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64bf16_to_v64f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v32
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB49_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-NEXT:    v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v32
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v10
-; GFX11-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v16
-; GFX11-NEXT:    v_bfe_u32 v16, v32, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v87, 0x40c00000, v87
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v16, v16, v32, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v38, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v36, 0x40c00000, v17
-; GFX11-NEXT:    v_bfe_u32 v17, v34, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add_f32_e32 v38, 0x40c00000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v11
-; GFX11-NEXT:    v_add3_u32 v17, v17, v34, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v17, v39, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v33
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_add3_u32 v33, v34, v36, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v37, v17, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v39, 0x40c00000, v18
-; GFX11-NEXT:    v_add3_u32 v34, v37, v35, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_perm_b32 v17, v33, v17, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v16, v32, v16, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v34, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v34, v37, v38, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v36, v39, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v37, 0x40c00000, v19
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_bfe_u32 v38, v37, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v36, v39, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v39
-; GFX11-NEXT:    v_add_f32_e32 v48, 0x40c00000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT:    v_add_f32_e32 v39, 0x40c00000, v20
-; GFX11-NEXT:    v_perm_b32 v18, v34, v18, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v38, v37, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v37
-; GFX11-NEXT:    v_bfe_u32 v38, v48, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v48
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v36, v38, v48, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT:    v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v36, v38, v39, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v39
-; GFX11-NEXT:    v_bfe_u32 v38, v49, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT:    v_bfe_u32 v39, v48, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v50, 0x40c00000, v21
-; GFX11-NEXT:    v_perm_b32 v19, v35, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v37, v38, v49, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v49
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-NEXT:    v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_perm_b32 v20, v36, v20, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v37, v38, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v37, v39, v48, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v48
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT:    v_bfe_u32 v48, v49, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v51, 0x40c00000, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v39, v50, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT:    v_perm_b32 v21, v37, v21, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v38, v39, v50, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v50
-; GFX11-NEXT:    v_add_f32_e32 v50, 0x40c00000, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v38, v39, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v38, v48, v49, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v49
-; GFX11-NEXT:    v_bfe_u32 v48, v51, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX11-NEXT:    v_bfe_u32 v49, v50, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v39, v48, v51, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v51
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-NEXT:    v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v39, v48, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v39, v49, v50, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v50
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT:    v_bfe_u32 v50, v51, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v53, 0x40c00000, v24
-; GFX11-NEXT:    v_perm_b32 v22, v38, v22, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v49, v52, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v23, v39, v23, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v48, v49, v52, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v52
-; GFX11-NEXT:    v_add_f32_e32 v52, 0x40c00000, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v48, v49, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v48, v50, v51, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v51
-; GFX11-NEXT:    v_bfe_u32 v50, v53, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-NEXT:    v_bfe_u32 v51, v52, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v49, v50, v53, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v53
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-NEXT:    v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v49, v50, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v49, v51, v52, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v52
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    v_bfe_u32 v52, v53, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v55, 0x40c00000, v26
-; GFX11-NEXT:    v_perm_b32 v24, v48, v24, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v51, v54, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v25, v49, v25, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v50, v51, v54, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v51, 0x400000, v54
-; GFX11-NEXT:    v_add_f32_e32 v54, 0x40c00000, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v50, v51, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v50, v52, v53, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v51, 0x400000, v53
-; GFX11-NEXT:    v_bfe_u32 v52, v55, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-NEXT:    v_bfe_u32 v53, v54, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v51, v52, v55, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v55
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-NEXT:    v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v51, v52, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v51, v53, v54, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v54
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-NEXT:    v_bfe_u32 v54, v55, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v65, 0x40c00000, v28
-; GFX11-NEXT:    v_perm_b32 v26, v50, v26, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v53, v64, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v27, v51, v27, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v52, v53, v64, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v53, 0x400000, v64
-; GFX11-NEXT:    v_add_f32_e32 v64, 0x40c00000, v29
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v52, v53, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v52, v54, v55, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v53, 0x400000, v55
-; GFX11-NEXT:    v_bfe_u32 v54, v65, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX11-NEXT:    v_bfe_u32 v55, v64, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v53, v54, v65, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v65
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-NEXT:    v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v53, v54, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v53, v55, v64, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v64
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-NEXT:    v_bfe_u32 v64, v65, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v67, 0x40c00000, v30
-; GFX11-NEXT:    v_perm_b32 v28, v52, v28, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v55, v66, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v29, v53, v29, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v54, v55, v66, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v66
-; GFX11-NEXT:    v_add_f32_e32 v66, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v54, v55, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v54, v64, v65, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v65
-; GFX11-NEXT:    v_bfe_u32 v64, v67, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
-; GFX11-NEXT:    v_bfe_u32 v65, v66, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v55, v64, v67, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v67
-; GFX11-NEXT:    v_add_f32_e32 v68, 0x40c00000, v31
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1
-; GFX11-NEXT:    v_perm_b32 v30, v54, v30, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v55, v64, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v67, 0x40c00000, v67
-; GFX11-NEXT:    v_add3_u32 v55, v65, v66, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v66
-; GFX11-NEXT:    v_bfe_u32 v65, v68, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-NEXT:    v_bfe_u32 v66, v0, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v55, v55, v64, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v64, v65, v68, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v65, 0x400000, v68
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
-; GFX11-NEXT:    v_bfe_u32 v68, v67, 16, 1
-; GFX11-NEXT:    v_perm_b32 v31, v55, v31, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v64, v64, v65, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v65, v66, v0, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_add3_u32 v65, v68, v67, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v67
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-NEXT:    v_perm_b32 v0, v0, v64, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v68, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v67, 0x400000, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v65, v65, v66, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v66, v68, v1, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v68, v69, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v66, v67, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v66, v68, v69, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v67, 0x400000, v69
-; GFX11-NEXT:    v_bfe_u32 v68, v2, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
-; GFX11-NEXT:    v_perm_b32 v1, v1, v65, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v66, v66, v67, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v67, v68, v2, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_perm_b32 v2, v2, v66, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v69, v70, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v70
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
-; GFX11-NEXT:    v_add3_u32 v67, v69, v70, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v69, v3, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v68, v69, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v69, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v67, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v70, v71, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v69, 0x400000, v71
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v68, v70, v71, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v70, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v68, v68, v69, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v69, v70, v4, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_perm_b32 v4, v4, v68, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v71, v80, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v80
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
-; GFX11-NEXT:    v_add3_u32 v69, v71, v80, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v71, v5, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v69, v69, v70, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v70, v71, v5, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v71, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v5, v5, v69, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v80, v81, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v71, 0x400000, v81
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v70, v80, v81, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v80, v6, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v70, v70, v71, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v71, v80, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_perm_b32 v6, v6, v70, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v81, v82, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v82
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT:    v_add3_u32 v71, v81, v82, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v81, v7, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v71, v71, v80, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v80, v81, v7, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v81, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v71, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v82, v83, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v81, 0x400000, v83
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v80, v82, v83, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v82, v8, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v80, v80, v81, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v81, v82, v8, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v8
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_perm_b32 v8, v8, v80, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v83, v84, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v84
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-NEXT:    v_add3_u32 v81, v83, v84, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v83, v9, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v81, v81, v82, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v82, v83, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v83, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v9, v9, v81, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v84, v85, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v83, 0x400000, v85
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v82, v84, v85, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v84, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v82, v82, v83, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v83, v84, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_perm_b32 v10, v10, v82, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v85, v86, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v86
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v11
-; GFX11-NEXT:    v_add3_u32 v83, v85, v86, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v85, v11, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13
-; GFX11-NEXT:    v_bfe_u32 v86, v87, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_or_b32_e32 v97, 0x400000, v87
-; GFX11-NEXT:    v_add_f32_e32 v84, 0x40c00000, v84
-; GFX11-NEXT:    v_add3_u32 v86, v86, v87, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_bfe_u32 v98, v12, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v99, v84, 16, 1
-; GFX11-NEXT:    v_add3_u32 v85, v85, v11, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v86, v86, v97, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v97, 0x400000, v12
-; GFX11-NEXT:    v_add3_u32 v87, v98, v12, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v98, v99, v84, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v99, 16, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v84
-; GFX11-NEXT:    v_bfe_u32 v101, v13, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v97, v101, v13, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v12, v12, v86, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v99, v87, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v84, v98, v100, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v98, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v101, 0x400000, v87
-; GFX11-NEXT:    v_bfe_u32 v102, v14, 16, 1
-; GFX11-NEXT:    v_add3_u32 v99, v99, v87, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101
-; GFX11-NEXT:    v_add3_u32 v101, v102, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_bfe_u32 v103, v98, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v98
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_add3_u32 v103, v103, v98, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v14, v14, v87, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v99, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v113, 0x400000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v98, v103, v112, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v99, v99, v15, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v99, v113, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v98, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v97, v100, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_perm_b32 v13, v13, v84, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v85, v96, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v11, v11, v83, 0x7060302
-; GFX11-NEXT:  .LBB49_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v48, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v71, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v80, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v81, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v71, 0x40c00000, v71
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v82, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v33, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v83, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v84, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v86, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v87, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v96, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v97, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v33
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v52, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v16, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v39 :: v_dual_lshlrev_b32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v34, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v34, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v65, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v32, v37, v32 :: v_dual_lshlrev_b32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v67, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v31, 16, v31
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v37, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v34, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v39
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v38, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v38
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v80, 0x40c00000, v80
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v36, v38, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v38, 0x40c00000, v48 :: v_dual_add_f32 v39, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v19, v34, v35
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v36, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v39, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v38, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v82, 0x40c00000, v82
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_lshlrev_b32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v36, v39, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v39
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v49
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v49, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v37, v38, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v38
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v48, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v39, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v84, 0x40c00000, v84
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v37, v48, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v48
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v49, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v86, 0x40c00000, v86
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v38, v39, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v39
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.l, v36.h
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v87, 0x40c00000, v87 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v96, 0x40c00000, v96 :: v_dual_cndmask_b32 v21, v37, v38
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v48, v49, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v49
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v15
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v22, v37, v38 :: v_dual_add_f32 v49, 0x40c00000, v51
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v50, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v50
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v51, 0x40c00000, v23 :: v_dual_add_f32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v39, v50, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v48, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v49, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v98, 0x40c00000, v98
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v101, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v39, v48, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v48
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-TRUE16-NEXT:    v_add3_u32 v101, v101, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v112, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v102, v98, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v114, 0x400000, v98
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v38, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v50, v49, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v52
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v51, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_add3_u32 v102, v102, v98, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v50, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v48, v51, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v51
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v52, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v49, v50, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v50
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v53
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v52, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v49, v52, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v52
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v52, 0x40c00000, v54 :: v_dual_add_f32 v53, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v25, v48, v49 :: v_dual_and_b32 v26, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v50, v51, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v51
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v53, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v52, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v49, v50, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v53
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v53, v54, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v51, v52, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v52
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v49.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v50, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v53, v54, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v64, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v55, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v54, 0x40c00000, v64 :: v_dual_add_f32 v53, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v50, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v52, v55, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v55
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v53, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v54, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v52, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v53
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v51, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v55, v54, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v54
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v65
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v53, v64, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v54, v55, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v53, v64, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v53, 0x400000, v64
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v66, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v65, 0x40c00000, v29 :: v_dual_lshlrev_b32 v30, 16, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v52, v53, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v54, v55, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v53, 0x400000, v55
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v66
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v54, v65, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v64, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v53, v54, v65, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v54, 0x400000, v65
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v65, 0x40c00000, v67 :: v_dual_add_f32 v66, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v30, v53, v54 :: v_dual_and_b32 v31, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v53, v55, v64, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v54, 0x400000, v64
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v65, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v54, v55, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v66, v67, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v64, v65, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v65
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v68, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v54.l, v54.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v66, v67, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v67
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v69, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v68, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v67, 0x40c00000, v69 :: v_dual_add_f32 v66, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v65, v68, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v68
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v68, v67, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v69, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v65, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v70, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v66, 0x400000, v67
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v65, v68, v67, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v68, v69, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v67, v70, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v64.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v65, v66, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v65, v68, v69, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v68, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v66, 0x400000, v69
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v68, 0x40c00000, v68 :: v_dual_cndmask_b32 v65, v65, v66
+; GFX11-TRUE16-NEXT:    v_add3_u32 v66, v67, v70, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v67, 0x400000, v70
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v68, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v65.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v66, v66, v67, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v67, v69, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v65, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v28.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v67, v69, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v67, v70, v68, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v68
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v68, v70, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v67, v67, v69, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v71, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v66
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v66.l, v27.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v65, v50
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v25.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v68, v69, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v68, v70, v71, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v71
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v80, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v25, 0xffff, v49, v48
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v17.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v68, v68, v69, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v70, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v70, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v67
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v66, v26
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v48, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v69, v70 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v71, v80, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v70, 0x400000, v80
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v81, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v69, v69, v70, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v71, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v71, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v68
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.l, v0.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v64, v55
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v30.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v70, v71, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v80, v81, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v71, 0x400000, v81
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v82, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v29.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v29, 0xffff, v55, v52
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v23.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v70, v70, v71, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v80, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v28, 0xffff, v64, v51
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v24.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, v71, v80 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v81, v82, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v82
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v83, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v30, 0xffff, v54, v53
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v71, v71, v80, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v81, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v22.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v24, 0xffff, v50, v39
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v23, 0xffff, v51, v38
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v22, 0xffff, v52, v37
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v82, v83, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v83
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v84, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v19.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v80, v80, v81, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v82, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v18.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v71
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v70
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v81, v82 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v83, v84, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v84
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v85, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v69
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v81, v82, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v83, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v80
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v31, 0xffff, v68, v31
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v21, 0xffff, v53, v21
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v20, 0xffff, v36, v35
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v82, v83, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v84, v85, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v85
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v85, v86, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v37, v34
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v38, v33
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v82, v82, v83, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v84, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v81
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v17, 0xffff, v39, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v83, v84 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v85, v86, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v86
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v86, v87, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v85, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v83, v83, v84, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v84, v85, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v85, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v82
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v84, v85, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v84, v86, v87, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v85, 0x400000, v87
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v86, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v87, v96, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v84, v84, v85, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v85, v86, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v83
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v12, v85, v86 :: v_dual_add_f32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v85, v87, v96, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v86, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v14, v101, v112 :: v_dual_add_f32 v87, 0x40c00000, v97
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v86, v86, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v100, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v99, v87, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v98, v102, v114 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v103, 0x400000, v87
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-TRUE16-NEXT:    v_add3_u32 v99, v99, v87, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v97, 0x400000, v96
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v113, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v112, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v98.l, v98.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v87, v99, v103, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v101, v113, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v84
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v101, v112, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v87
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v98, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v86, v100, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v85, v85, v97, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v85
+; GFX11-TRUE16-NEXT:  .LBB49_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v35, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v87, 0x40c00000, v87
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v11
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v17, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v34, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v37, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v37, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v33, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v32, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v34, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v37, v38, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v37, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v36, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v34, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v38, v37, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v37
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v38, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v38, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v35, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v36, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v39, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v37, v21, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v39, v50, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v48, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v48, v51, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v49, v50, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v38, v22, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v39, v23, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v49, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v50, v51, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v50, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v53
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v49, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v51, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v48, v24, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v54, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v49, v25, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v51, v54, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v54
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v50, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v52, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v53
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v55, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v53, v54, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v52, v55, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v55
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v51, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v53, v54, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v54
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v54, v55, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v65, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v50, v26, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v53, v64, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v51, v27, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v53, v64, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v64
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v52, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v54, v55, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v55
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v54, v65, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v64, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v53, v54, v65, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v65
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v53, v55, v64, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v64
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v64, v65, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v52, v28, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v66, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v53, v29, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v54, v55, v66, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v66
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v66, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v54, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v54, v64, v65, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v65
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v64, v67, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v66, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v55, v64, v67, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v64, 0x400000, v67
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v68, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v54, v30, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v55, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v67
+; GFX11-FAKE16-NEXT:    v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v64, 0x400000, v66
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v68, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v0, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v64, v65, v68, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, 0x400000, v68
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v67, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v55, v31, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v66, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v68, v67, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v67
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v64, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v65, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v68, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v69, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v66, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v68, v69, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v69
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v65, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v66, v66, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v68, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v66, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v69, v70, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v70
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v69, v70, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v69, v3, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v68, v69, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v69, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v67, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v70, v71, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v69, 0x400000, v71
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v68, v70, v71, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v70, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v68, v69, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v69, v70, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v70, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v68, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v71, v80, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v70, 0x400000, v80
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-FAKE16-NEXT:    v_add3_u32 v69, v71, v80, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v71, v5, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v69, v69, v70, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v70, v71, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v69, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v80, v81, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v81
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v70, v80, v81, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v80, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v70, v71, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v71, v80, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v70, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v81, v82, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v82
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT:    v_add3_u32 v71, v81, v82, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v81, v7, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v71, v71, v80, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v80, v81, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v81, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v71, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v82, v83, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v81, 0x400000, v83
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v80, v82, v83, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v82, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v80, v81, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v81, v82, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v80, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v83, v84, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v84
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-FAKE16-NEXT:    v_add3_u32 v81, v83, v84, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v83, v9, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v81, v82, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v82, v83, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v83, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v81, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v84, v85, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v83, 0x400000, v85
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v82, v84, v85, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v84, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v82, v83, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v83, v84, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v84, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v82, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v86, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v84, 0x400000, v86
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v96, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_add3_u32 v83, v85, v86, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v11, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v86, v87, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v97, 0x400000, v87
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v84, 0x40c00000, v84
+; GFX11-FAKE16-NEXT:    v_add3_u32 v86, v86, v87, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v98, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v99, v84, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v85, v85, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v86, v97, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v97, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v87, v98, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v98, v99, v84, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v100, 0x400000, v84
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v101, v13, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v97, v101, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v86, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v99, v87, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v98, v100, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v101, 0x400000, v87
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v102, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v99, v99, v87, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v100, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101
+; GFX11-FAKE16-NEXT:    v_add3_u32 v101, v102, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v102, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v103, v98, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v98
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v103, v103, v98, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v87, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v99, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v113, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v103, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v99, v99, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v99, v113, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v98, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v97, v100, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v84, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v85, v96, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v83, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB49_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -99702,532 +112644,1120 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64bf16_to_v64i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v32
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB51_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v69, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v71, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
-; GFX11-NEXT:    v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v32
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
-; GFX11-NEXT:    v_lshlrev_b32_e32 v83, 16, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v85, 16, v10
-; GFX11-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v87, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v16
-; GFX11-NEXT:    v_bfe_u32 v16, v32, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v87, 0x40c00000, v87
-; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v34
-; GFX11-NEXT:    v_add3_u32 v16, v16, v32, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v80, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v82, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v84, 16, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v38, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v36, 0x40c00000, v17
-; GFX11-NEXT:    v_bfe_u32 v17, v34, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add_f32_e32 v38, 0x40c00000, v18
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v86, 16, v11
-; GFX11-NEXT:    v_add3_u32 v17, v17, v34, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v34, v36, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v17, v39, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v33
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT:    v_add3_u32 v33, v34, v36, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v36
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v37, v17, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v37, v35, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v34, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v39, 0x40c00000, v18
-; GFX11-NEXT:    v_add3_u32 v34, v37, v35, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v37, v38, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT:    v_perm_b32 v17, v33, v17, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v16, v32, v16, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v34, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v34, v37, v38, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v36, v39, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v37, 0x40c00000, v19
-; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v38
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT:    v_bfe_u32 v38, v37, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v36, v39, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v39
-; GFX11-NEXT:    v_add_f32_e32 v48, 0x40c00000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT:    v_add_f32_e32 v39, 0x40c00000, v20
-; GFX11-NEXT:    v_perm_b32 v18, v34, v18, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v35, v38, v37, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v37
-; GFX11-NEXT:    v_bfe_u32 v38, v48, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v48
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v36, v38, v48, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v38, v39, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT:    v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v36, v38, v39, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v39
-; GFX11-NEXT:    v_bfe_u32 v38, v49, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
-; GFX11-NEXT:    v_bfe_u32 v39, v48, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v50, 0x40c00000, v21
-; GFX11-NEXT:    v_perm_b32 v19, v35, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v37, v38, v49, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v49
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-NEXT:    v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT:    v_perm_b32 v20, v36, v20, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v37, v38, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v37, v39, v48, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v48
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
-; GFX11-NEXT:    v_bfe_u32 v48, v49, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v51, 0x40c00000, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v39, v50, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT:    v_perm_b32 v21, v37, v21, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v38, v39, v50, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v50
-; GFX11-NEXT:    v_add_f32_e32 v50, 0x40c00000, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v38, v39, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v38, v48, v49, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v39, 0x400000, v49
-; GFX11-NEXT:    v_bfe_u32 v48, v51, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
-; GFX11-NEXT:    v_bfe_u32 v49, v50, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v39, v48, v51, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v51
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-NEXT:    v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v39, v48, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v39, v49, v50, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v50
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
-; GFX11-NEXT:    v_bfe_u32 v50, v51, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v53, 0x40c00000, v24
-; GFX11-NEXT:    v_perm_b32 v22, v38, v22, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v49, v52, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v23, v39, v23, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v48, v49, v52, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v52
-; GFX11-NEXT:    v_add_f32_e32 v52, 0x40c00000, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v48, v49, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v48, v50, v51, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v51
-; GFX11-NEXT:    v_bfe_u32 v50, v53, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
-; GFX11-NEXT:    v_bfe_u32 v51, v52, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX11-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v49, v50, v53, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v53
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-NEXT:    v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v49, v50, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v49, v51, v52, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v50, 0x400000, v52
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
-; GFX11-NEXT:    v_bfe_u32 v52, v53, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v55, 0x40c00000, v26
-; GFX11-NEXT:    v_perm_b32 v24, v48, v24, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v51, v54, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v25, v49, v25, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v50, v51, v54, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v51, 0x400000, v54
-; GFX11-NEXT:    v_add_f32_e32 v54, 0x40c00000, v27
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v50, v51, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v50, v52, v53, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v51, 0x400000, v53
-; GFX11-NEXT:    v_bfe_u32 v52, v55, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
-; GFX11-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GFX11-NEXT:    v_bfe_u32 v53, v54, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX11-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v51, v52, v55, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v55
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-NEXT:    v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; GFX11-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v51, v52, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v51, v53, v54, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v52, 0x400000, v54
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
-; GFX11-NEXT:    v_bfe_u32 v54, v55, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v65, 0x40c00000, v28
-; GFX11-NEXT:    v_perm_b32 v26, v50, v26, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v53, v64, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v27, v51, v27, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v52, v53, v64, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v53, 0x400000, v64
-; GFX11-NEXT:    v_add_f32_e32 v64, 0x40c00000, v29
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v52, v53, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v52, v54, v55, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v53, 0x400000, v55
-; GFX11-NEXT:    v_bfe_u32 v54, v65, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; GFX11-NEXT:    v_bfe_u32 v55, v64, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX11-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v53, v54, v65, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v65
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-NEXT:    v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v53, v54, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v53, v55, v64, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v54, 0x400000, v64
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
-; GFX11-NEXT:    v_bfe_u32 v64, v65, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v67, 0x40c00000, v30
-; GFX11-NEXT:    v_perm_b32 v28, v52, v28, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v55, v66, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v29, v53, v29, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v54, v55, v66, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v66
-; GFX11-NEXT:    v_add_f32_e32 v66, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v54, v55, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v54, v64, v65, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v55, 0x400000, v65
-; GFX11-NEXT:    v_bfe_u32 v64, v67, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
-; GFX11-NEXT:    v_bfe_u32 v65, v66, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v55, v64, v67, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v67
-; GFX11-NEXT:    v_add_f32_e32 v68, 0x40c00000, v31
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1
-; GFX11-NEXT:    v_perm_b32 v30, v54, v30, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v55, v64, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v67, 0x40c00000, v67
-; GFX11-NEXT:    v_add3_u32 v55, v65, v66, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v64, 0x400000, v66
-; GFX11-NEXT:    v_bfe_u32 v65, v68, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
-; GFX11-NEXT:    v_bfe_u32 v66, v0, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v55, v55, v64, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v64, v65, v68, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v65, 0x400000, v68
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
-; GFX11-NEXT:    v_bfe_u32 v68, v67, 16, 1
-; GFX11-NEXT:    v_perm_b32 v31, v55, v31, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v64, v64, v65, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v65, v66, v0, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_add3_u32 v65, v68, v67, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v66, 0x400000, v67
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
-; GFX11-NEXT:    v_perm_b32 v0, v0, v64, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v68, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v67, 0x400000, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v65, v65, v66, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v66, v68, v1, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v68, v69, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v66, v67, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v66, v68, v69, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v67, 0x400000, v69
-; GFX11-NEXT:    v_bfe_u32 v68, v2, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
-; GFX11-NEXT:    v_perm_b32 v1, v1, v65, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v66, v66, v67, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v67, v68, v2, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_perm_b32 v2, v2, v66, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v69, v70, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v68, 0x400000, v70
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
-; GFX11-NEXT:    v_add3_u32 v67, v69, v70, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v69, v3, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v68, v69, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v69, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v67, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v70, v71, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v69, 0x400000, v71
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v68, v70, v71, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v70, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v68, v68, v69, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v69, v70, v4, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_perm_b32 v4, v4, v68, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v71, v80, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v70, 0x400000, v80
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
-; GFX11-NEXT:    v_add3_u32 v69, v71, v80, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v71, v5, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v69, v69, v70, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v70, v71, v5, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v71, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v5, v5, v69, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v80, v81, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v71, 0x400000, v81
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v70, v80, v81, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v80, v6, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v70, v70, v71, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v71, v80, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_perm_b32 v6, v6, v70, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v81, v82, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v80, 0x400000, v82
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
-; GFX11-NEXT:    v_add3_u32 v71, v81, v82, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v81, v7, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v71, v71, v80, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v80, v81, v7, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v81, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v71, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v82, v83, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v81, 0x400000, v83
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v83, v83
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v80, v82, v83, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v82, v8, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v80, v80, v81, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v81, v82, v8, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v8
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_perm_b32 v8, v8, v80, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v83, v84, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v82, 0x400000, v84
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-NEXT:    v_add3_u32 v81, v83, v84, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v83, v9, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v81, v81, v82, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v82, v83, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v83, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v9, v9, v81, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v84, v85, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v83, 0x400000, v85
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v82, v84, v85, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v84, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v82, v82, v83, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v83, v84, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_perm_b32 v10, v10, v82, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v85, v86, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v84, 0x400000, v86
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
-; GFX11-NEXT:    v_or_b32_e32 v96, 0x400000, v11
-; GFX11-NEXT:    v_add3_u32 v83, v85, v86, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v85, v11, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13
-; GFX11-NEXT:    v_bfe_u32 v86, v87, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_or_b32_e32 v97, 0x400000, v87
-; GFX11-NEXT:    v_add_f32_e32 v84, 0x40c00000, v84
-; GFX11-NEXT:    v_add3_u32 v86, v86, v87, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_bfe_u32 v98, v12, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v99, v84, 16, 1
-; GFX11-NEXT:    v_add3_u32 v85, v85, v11, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v86, v86, v97, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v97, 0x400000, v12
-; GFX11-NEXT:    v_add3_u32 v87, v98, v12, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v98, v99, v84, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v99, 16, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v84
-; GFX11-NEXT:    v_bfe_u32 v101, v13, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v97, v101, v13, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v12, v12, v86, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v99, v87, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v84, v98, v100, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v98, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v101, 0x400000, v87
-; GFX11-NEXT:    v_bfe_u32 v102, v14, 16, 1
-; GFX11-NEXT:    v_add3_u32 v99, v99, v87, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
-; GFX11-NEXT:    v_or_b32_e32 v100, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101
-; GFX11-NEXT:    v_add3_u32 v101, v102, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v102, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_bfe_u32 v103, v98, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v112, 0x400000, v98
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_add3_u32 v103, v103, v98, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v14, v14, v87, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v99, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v113, 0x400000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v98, v103, v112, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v99, v99, v15, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v99, v113, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v98, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v97, v100, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_perm_b32 v13, v13, v84, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v85, v96, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v11, v11, v83, 0x7060302
-; GFX11-NEXT:  .LBB51_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64bf16_to_v64i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v14
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v17 :: v_dual_add_f32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v33, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v35, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v35
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v17, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v39, v33, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v48, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v102, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v32.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v33
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v36, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v39, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v38, v32, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v35.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v35, 16, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v36, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v36
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v35, 0x40c00000, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v36, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v35, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v33, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v34, v48, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v35, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.l, v33.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v37, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v34, v34, v48, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v35
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_cndmask_b32 v34, v34, v38
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v37
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.l, v34.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v36, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v36, 16, v21
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v49, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v37, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v36
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v49, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v37, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v36, 0x400000, v37
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v35.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v38, v36 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v48, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v48
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v48, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v38, 16, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v21, v37, v49 :: v_dual_and_b32 v22, 0xffff0000, v22
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v51, 0x40c00000, v22 :: v_dual_lshlrev_b32 v48, 16, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v38
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v22, v39, v50 :: v_dual_and_b32 v23, 0xffff0000, v23
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v39, v51, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v51
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v38, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v38
+; GFX11-TRUE16-NEXT:    v_add3_u32 v39, v39, v51, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v38, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v21.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v39, v39, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v48, 0x40c00000, v48 :: v_dual_cndmask_b32 v23, v37, v50
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v37.l, v39.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v39, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v49, v48, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v48
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v24
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v52, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v52
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT:    v_add3_u32 v49, v49, v48, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v52, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v39, 0x40c00000, v39 :: v_dual_cndmask_b32 v38, v38, v50
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v48, v53, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v39, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.l, v38.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v24, v49, v51 :: v_dual_lshlrev_b32 v49, 16, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v48, v48, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v51, 0x400000, v53
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v50, v39, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v49 :: v_dual_cndmask_b32 v48, v48, v51
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v48.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v50, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v50, v54, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v48, 16, v26
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v49, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v52, 0x400000, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT:    v_add3_u32 v50, v50, v54, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v48
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v49, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v49
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v50, v50, v52, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v49, v51, v48 :: v_dual_and_b32 v26, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v53, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v53
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v54, 0x400000, v26
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v52, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v51, v54, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v26.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v53, 0x40c00000, v53 :: v_dual_add_f32 v50, 0x40c00000, v50
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v52, v55, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v65, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v50, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v52, v64, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v54, 0x400000, v64
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v50
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v50, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v52, v52, v64, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v53
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v52, v52, v54, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v54, v53, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v52.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v52, 16, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v54, v54, v53, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v51, v55, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v51, v65, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v55, 0x400000, v65
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v51, v51, v65, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_cndmask_b32 v51, v51, v55
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v53, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v51.l, v51.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v54, v64, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v53, v53, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v64, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v54, 16, v30
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v53, v53, v64 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v52, 0x40c00000, v52 :: v_dual_add_f32 v67, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v52, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v65, 0x400000, v52
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v55, v52, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.l, v53.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v53, 16, v31
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v30, v55, v65 :: v_dual_and_b32 v31, 0xffff0000, v31
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v55, v67, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v65, 0x400000, v67
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v53 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v55, v55, v67, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v67, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v55, v55, v65 :: v_dual_add_f32 v54, 0x40c00000, v54
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v54, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v53, 0x400000, v54
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v54, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v54, v64, v53, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v55.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v55, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v55
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v68, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v69, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v31, v64, v67, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-TRUE16-NEXT:    v_add3_u32 v65, v65, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v55, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v67, 0x400000, v69
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.l, v31.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v65, v68, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v65, v69, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v55, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v68, 0x400000, v55
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v65, v65, v69, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v65, v65, v67, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v65, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v70, 0x40c00000, v1 :: v_dual_cndmask_b32 v1, v64, v68
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v66 :: v_dual_add_f32 v65, 0x40c00000, v65
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v64, v70, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v68, 0x400000, v70
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v67, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_add3_u32 v64, v64, v70, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v71, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v67, v67, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v70, 0x400000, v65
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v64, v64, v68, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v66, v71, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v68, v65, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v64.l, v64.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, v67, v69 :: v_dual_lshlrev_b32 v67, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v66, v66, v71, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v69, 0x400000, v71
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v68, v68, v65, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v66, v66, v69 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v67
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.l, v66.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v66, 16, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v68, v68, v70, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v67, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v71, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v69, v67, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v67
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v3, v70, v71
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v66, 0x40c00000, v66
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v67, v69, v80, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v66, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v80, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v66
+; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v69, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v66, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v69, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v71, 0x40c00000, v71
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v66, v70, v81, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v69, v71, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v81, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v69, v69, v71, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v71
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v70, v81, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v80, 0x40c00000, v80
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v66
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v69, v69, v82, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v71, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v82, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v70, v80, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v80
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, v71, v82 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v70, v70, v80, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v81, 0x40c00000, v81
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v70, v70, v83, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v71, v81, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v80, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v83, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v71, v71, v81, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v81
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v5, 16, v69
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v80, v83, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v82, 0x40c00000, v82
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v67
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v68
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v71, v71, v84, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v81, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v84, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v80, v82, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v85, 0x400000, v82
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v81, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v80, v80, v82, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v83, 0x40c00000, v83
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v80, v80, v85, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v81, v83, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v82, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v85, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v81, v81, v83, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, 0x400000, v83
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v70
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v82, v85, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v84, 0x40c00000, v84
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v85, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v81, v86, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v82, v84, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v86, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v83, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v84
+; GFX11-TRUE16-NEXT:    v_add3_u32 v82, v82, v84, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v65, 16, v67
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v83, v86, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v85, 0x40c00000, v85
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v96, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v82, v82, v87, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v83, v85, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v85
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-TRUE16-NEXT:    v_add3_u32 v84, v84, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v83, v83, v85, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v64, 16, v68
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v55, 16, v69
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v30
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v83, v83, v87, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v85, 0x40c00000, v85
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v29
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v28
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v84, v96, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v84, v86, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v86
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v96, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v66
+; GFX11-TRUE16-NEXT:    v_add3_u32 v84, v84, v86, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_add3_u32 v86, v96, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v96, v85, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v84, v84, v87, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v29, v52, 16, v55
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v28, v51, 16, v64
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v27, v50, 16, v65
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v97, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v86, v96, v85, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v96, 0x40c00000, v98
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v100, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v87, 0x400000, v85
+; GFX11-TRUE16-NEXT:    v_add3_u32 v97, v97, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v99, v96, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v101, 0x400000, v96
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v96, v96
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v103, v98, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v112, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v99, v99, v96, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v113, 0x400000, v98
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v96, v99, v101, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v99, v102, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v101, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v102, v103, v98, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v103, v112, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v112, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v96
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v99, v101, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v83.l, v83.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v14.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v103, v112, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v98, v102, v113, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v30, v53, 16, v54
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v98
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v97, v100, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v15.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v25, v48, 16, v49
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v24, v39, 16, v50
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v23, v38, 16, v51
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v85, v86, v87, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v86.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v87.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v13, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v12, 16, v96
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v85
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v22, v37, 16, v52
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v87, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v86, 16, v84
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v83, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v82
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v81
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v8, 16, v80
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v7, 16, v71
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v31, v31, 16, v70
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v26, v26, 16, v66
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v21, 16, v53
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v20, v35, 16, v36
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v34, 16, v37
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v33, 16, v38
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v17, v32, 16, v39
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v16, 16, v48
+; GFX11-TRUE16-NEXT:  .LBB51_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64bf16_to_v64i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v69, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v71, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v81, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v83, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v85, 16, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v87, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v35, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v32, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v87, 0x40c00000, v87
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v32, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v80, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v82, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v84, 16, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v36, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v38, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v86, 16, v11
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v36, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v17, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v33
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v34, v36, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v36
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v37, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v35, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v33, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v37, v35, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v38, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v17, v33, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v32, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v34, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v37, v38, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v36, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v37, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v35, 0x400000, v38
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v38, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v37, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v34, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v36, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v48, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v39, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v18, v34, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v38, v37, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v37
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v37, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v35, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v38, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v39, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v36, v38, v39, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v39
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v39, v39
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v48, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v19, v35, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v36, v36, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v20, v36, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v39, v48, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v48
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v48, v48
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v49, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v51, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v37, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_perm_b32 v21, v37, v21, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v39, v50, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v50, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v38, v48, v49, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v39, 0x400000, v49
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v48, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v49, v49
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v50, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v38, v38, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v48, v51, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v49, v50, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v50
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v50, v50
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v51, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v53, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_perm_b32 v22, v38, v22, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v39, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v49, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v23, v39, v23, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v49, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v52, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v48, v50, v51, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v51
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v50, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v51, v51
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v52, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v48, v48, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v50, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v53
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v49, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v49, v51, v52, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v50, 0x400000, v52
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v52, v52
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v53, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v55, 0x40c00000, v26
+; GFX11-FAKE16-NEXT:    v_perm_b32 v24, v48, v24, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v49, v49, v50, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v51, v54, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v25, v49, v25, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v51, v54, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v54
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v54, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v50, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v50, v52, v53, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v51, 0x400000, v53
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v52, v55, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v53, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v53, v54, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v50, v50, v51, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v52, v55, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v55
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v27, v51, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v51, v53, v54, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v52, 0x400000, v54
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v54, v54
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v54, v55, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v65, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v50, v26, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v51, v51, v52, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v53, v64, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v51, v27, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v53, v64, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v64
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v64, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v28, v52, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v52, v54, v55, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v53, 0x400000, v55
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v54, v65, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v55, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v64, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v52, v52, v53, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v53, v54, v65, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v65
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v29, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v53, v55, v64, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v54, 0x400000, v64
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v64, v64
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v64, v65, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v52, v28, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v53, v53, v54, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v55, v66, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v53, v29, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v54, v55, v66, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v66
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v66, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v54, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v54, v64, v65, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v55, 0x400000, v65
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v64, v67, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v65, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v66, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v54, v54, v55, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v55, v64, v67, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v64, 0x400000, v67
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v68, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v30, v54, v30, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v55, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v67, 0x40c00000, v67
+; GFX11-FAKE16-NEXT:    v_add3_u32 v55, v65, v66, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v64, 0x400000, v66
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v65, v68, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v66, v66
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v66, v0, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v55, v55, v64, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v64, v65, v68, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v65, 0x400000, v68
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v68, v68
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v67, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v31, v55, v31, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v64, v64, v65, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v66, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v65, v68, v67, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v66, 0x400000, v67
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v67, v67
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v64, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v65, v65, v66, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v68, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v69, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v66, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v66, v68, v69, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, 0x400000, v69
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v68, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v69, v69
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v65, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v66, v66, v67, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v68, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v66, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v69, v70, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v68, 0x400000, v70
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v70, v70
+; GFX11-FAKE16-NEXT:    v_add3_u32 v67, v69, v70, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v69, v3, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v67, v67, v68, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v68, v69, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v69, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v67, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v70, v71, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v69, 0x400000, v71
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v71, v71
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v68, v70, v71, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v70, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v68, v68, v69, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v69, v70, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v70, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v68, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v71, v80, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v70, 0x400000, v80
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v80, v80
+; GFX11-FAKE16-NEXT:    v_add3_u32 v69, v71, v80, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v71, v5, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v69, v69, v70, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v70, v71, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v69, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v80, v81, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v71, 0x400000, v81
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v81, v81
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v70, v80, v81, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v80, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v70, v70, v71, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v71, v80, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v70, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v81, v82, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v80, 0x400000, v82
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v82, v82
+; GFX11-FAKE16-NEXT:    v_add3_u32 v71, v81, v82, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v81, v7, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v71, v71, v80, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v80, v81, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v81, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v71, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v82, v83, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v81, 0x400000, v83
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v83, v83
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v80, v82, v83, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v82, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v80, v80, v81, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v81, v82, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v80, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v83, v84, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v82, 0x400000, v84
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-FAKE16-NEXT:    v_add3_u32 v81, v83, v84, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v83, v9, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v81, v81, v82, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v82, v83, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v83, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v81, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v84, v85, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v83, 0x400000, v85
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v85, v85
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v82, v84, v85, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v84, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v82, v82, v83, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v83, v84, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v84, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v82, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v86, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v84, 0x400000, v86
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v86, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v96, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_add3_u32 v83, v85, v86, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v85, v11, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v86, v87, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v97, 0x400000, v87
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v84, 0x40c00000, v84
+; GFX11-FAKE16-NEXT:    v_add3_u32 v86, v86, v87, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v98, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v99, v84, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v85, v85, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v86, v86, v97, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v97, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v87, v98, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v98, v99, v84, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v99, 16, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v100, 0x400000, v84
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v101, v13, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v84, v84
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v97, v101, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v86, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v99, v87, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v84, v98, v100, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v98, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v101, 0x400000, v87
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v102, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v99, v99, v87, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v87, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v100, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101
+; GFX11-FAKE16-NEXT:    v_add3_u32 v101, v102, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v102, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v103, v98, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v112, 0x400000, v98
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v103, v103, v98, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v87, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v99, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v113, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v98, v103, v112, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v99, v99, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v99, v113, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v98, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v97, v100, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v84, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v85, v96, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v83, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB51_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index c2cac55e13b09..b040e77125770 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v4i32_to_v4f32:
@@ -1371,83 +1372,170 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8bf16_to_v4i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB11_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
-; GFX11-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT:    v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
-; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
-; GFX11-NEXT:  .LBB11_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT:  .LBB11_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB11_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1654,69 +1742,126 @@ define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v17
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i32_to_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
-; GFX11-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB12_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB12_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB12_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v19, 3, v19
-; GFX11-NEXT:    v_add_nc_u32_e32 v17, 3, v17
-; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT:    v_add_nc_u32_e32 v18, 3, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB12_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v18
-; GFX11-NEXT:    v_mov_b32_e32 v4, v19
-; GFX11-NEXT:    v_mov_b32_e32 v8, v16
-; GFX11-NEXT:    v_mov_b32_e32 v12, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i32_to_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 3, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB12_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v17.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i32_to_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 3, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v17, 3, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v18, 3, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB12_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2032,126 +2177,260 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i8_to_v4i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_4
-; GFX11-NEXT:  .LBB13_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB13_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB13_2
-; GFX11-NEXT:  .LBB13_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v17, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v19, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v20, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v13, v10
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i8_to_v4i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-TRUE16-NEXT:  .LBB13_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-TRUE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v7.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v8.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v4.h, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i8_to_v4i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-FAKE16-NEXT:  .LBB13_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-FAKE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v17, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v13, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3361,83 +3640,170 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8bf16_to_v4f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB23_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
-; GFX11-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT:    v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
-; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
-; GFX11-NEXT:  .LBB23_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v4f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT:  .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v4f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3644,67 +4010,122 @@ define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v17
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4f32_to_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
-; GFX11-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB24_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v16, 1.0, v16
-; GFX11-NEXT:    v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v18, 1.0, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB24_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v18
-; GFX11-NEXT:    v_mov_b32_e32 v4, v19
-; GFX11-NEXT:    v_mov_b32_e32 v8, v16
-; GFX11-NEXT:    v_mov_b32_e32 v12, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4f32_to_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB24_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB24_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v17.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4f32_to_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4020,126 +4441,260 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i8_to_v4f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_4
-; GFX11-NEXT:  .LBB25_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB25_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB25_2
-; GFX11-NEXT:  .LBB25_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v17, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v19, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v20, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v13, v10
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i8_to_v4f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-TRUE16-NEXT:  .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v7.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v8.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v4.h, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i8_to_v4f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-FAKE16-NEXT:  .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v17, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v13, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5196,83 +5751,170 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8bf16_to_v2i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB33_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
-; GFX11-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT:    v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
-; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
-; GFX11-NEXT:  .LBB33_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT:  .LBB33_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB33_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5479,69 +6121,126 @@ define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v17
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2i64_to_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
-; GFX11-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB34_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_co_u32 v16, vcc_lo, v16, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v18, vcc_lo, v18, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB34_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v18
-; GFX11-NEXT:    v_mov_b32_e32 v4, v19
-; GFX11-NEXT:    v_mov_b32_e32 v8, v16
-; GFX11-NEXT:    v_mov_b32_e32 v12, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2i64_to_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB34_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB34_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB34_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v17.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2i64_to_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB34_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB34_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5857,126 +6556,260 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i8_to_v2i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_4
-; GFX11-NEXT:  .LBB35_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB35_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB35_2
-; GFX11-NEXT:  .LBB35_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v17, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v19, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v20, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v13, v10
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i8_to_v2i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-TRUE16-NEXT:  .LBB35_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-TRUE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v7.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v8.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v4.h, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i8_to_v2i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-FAKE16-NEXT:  .LBB35_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-FAKE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v17, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v13, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -6845,83 +7678,170 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8bf16_to_v2f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB41_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
-; GFX11-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
-; GFX11-NEXT:    v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v6
-; GFX11-NEXT:    v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
-; GFX11-NEXT:    v_bfe_u32 v8, v0, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
-; GFX11-NEXT:  .LBB41_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v2f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v13, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v7, v0
+; GFX11-TRUE16-NEXT:  .LBB41_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v2f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v9, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v10, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v6, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB41_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -7126,66 +8046,120 @@ define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v17
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2f64_to_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
-; GFX11-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB42_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB42_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB42_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_f64 v[16:17], v[16:17], 1.0
-; GFX11-NEXT:    v_add_f64 v[18:19], v[18:19], 1.0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB42_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v18
-; GFX11-NEXT:    v_mov_b32_e32 v4, v19
-; GFX11-NEXT:    v_mov_b32_e32 v8, v16
-; GFX11-NEXT:    v_mov_b32_e32 v12, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2f64_to_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB42_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB42_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB42_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB42_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v17.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2f64_to_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB42_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB42_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB42_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB42_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -7501,126 +8475,260 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i8_to_v2f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_4
-; GFX11-NEXT:  .LBB43_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB43_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB43_2
-; GFX11-NEXT:  .LBB43_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v17, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v19, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v20, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v13, v10
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i8_to_v2f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-TRUE16-NEXT:  .LBB43_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-TRUE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v7.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v8.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v4.h, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i8_to_v2f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-FAKE16-NEXT:  .LBB43_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-FAKE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v17, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v13, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -8297,82 +9405,171 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8bf16_to_v8i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB47_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v13, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_add3_u32 v7, v13, v1, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v7, v8 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v8, v11, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_bfe_u32 v12, v2, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v13, v7, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v14, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v12, v2, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v11, v13, v7, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v13, v14, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT:    v_perm_b32 v1, v5, v1, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_perm_b32 v3, v7, v3, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
-; GFX11-NEXT:  .LBB47_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v8i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v6, 16, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_cndmask_b32 v1, v7, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v10, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v12, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v6
+; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v8i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v13, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v7, v8 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v11, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v12, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v5, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v7, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v8, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB47_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -8635,69 +9832,126 @@ define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v17
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i16_to_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
-; GFX11-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB48_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB48_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v18
-; GFX11-NEXT:    v_mov_b32_e32 v4, v19
-; GFX11-NEXT:    v_mov_b32_e32 v8, v16
-; GFX11-NEXT:    v_mov_b32_e32 v12, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i16_to_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB48_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB48_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v17.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i16_to_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB48_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB48_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -9026,126 +10280,260 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i8_to_v8i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_4
-; GFX11-NEXT:  .LBB49_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB49_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB49_2
-; GFX11-NEXT:  .LBB49_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v17, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v19, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v20, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v13, v10
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i8_to_v8i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-TRUE16-NEXT:  .LBB49_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-TRUE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v7.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v8.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v4.h, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i8_to_v8i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-FAKE16-NEXT:  .LBB49_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-FAKE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v17, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v13, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -9578,82 +10966,169 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8bf16_to_v8f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB51_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v13, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_add3_u32 v7, v13, v1, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_bfe_u32 v11, v6, 16, 1
-; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v7, v8 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v8, v11, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_bfe_u32 v12, v2, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v13, v7, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v14, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v12, v2, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v11, v13, v7, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v13, v14, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v3
-; GFX11-NEXT:    v_perm_b32 v1, v5, v1, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_perm_b32 v3, v7, v3, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
-; GFX11-NEXT:  .LBB51_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v8f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v1, v7, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v10, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v10, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v7, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v4
+; GFX11-TRUE16-NEXT:  .LBB51_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v8f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v13, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v7, v8 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v11, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v12, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v5, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v7, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v8, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v4, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB51_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -9915,69 +11390,126 @@ define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v17
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8f16_to_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
-; GFX11-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB52_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB52_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB52_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB52_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v18
-; GFX11-NEXT:    v_mov_b32_e32 v4, v19
-; GFX11-NEXT:    v_mov_b32_e32 v8, v16
-; GFX11-NEXT:    v_mov_b32_e32 v12, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8f16_to_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB52_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB52_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB52_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[16:17]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB52_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v17.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8f16_to_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB52_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB52_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB52_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB52_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -10289,126 +11821,260 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i8_to_v8f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_4
-; GFX11-NEXT:  .LBB53_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB53_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB53_2
-; GFX11-NEXT:  .LBB53_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v17, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v19, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v20, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v13, v10
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i8_to_v8f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-TRUE16-NEXT:  .LBB53_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-TRUE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v7.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v8.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v4.h, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i8_to_v8f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-FAKE16-NEXT:  .LBB53_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-FAKE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v17, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v13, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -10790,138 +12456,272 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v12, v17
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8bf16_to_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
-; GFX11-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB54_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
-; GFX11-NEXT:  .LBB54_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB54_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v4
-; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v18
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v7, v8 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v1, v11, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_bfe_u32 v9, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v4, vcc_lo
-; GFX11-NEXT:    v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v1, 16, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v9, v4, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v9, v4, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v0, v1, 16, 1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v8, v9, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v10, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v0, v0, v1, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v8, v6, v4, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v10, v10, v7, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v10, v11, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_perm_b32 v1, v5, v12, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v0, v13, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_perm_b32 v7, v7, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:  .LBB54_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v18
-; GFX11-NEXT:    v_mov_b32_e32 v4, v19
-; GFX11-NEXT:    v_mov_b32_e32 v8, v16
-; GFX11-NEXT:    v_mov_b32_e32 v12, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8bf16_to_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[16:17], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[2:3]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v11.h
+; GFX11-TRUE16-NEXT:  .LBB54_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v6, v8 :: v_dual_and_b32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v14, v7, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v2, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v14, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v10, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v5, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v7, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v9, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[16:17], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[2:3]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  .LBB54_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8bf16_to_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB54_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[16:17]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[18:19]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v18
+; GFX11-FAKE16-NEXT:  .LBB54_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v18
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v7, v8 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v1, v11, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v1, 16, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v9, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v0, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v0, v0, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v6, v4, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v5, v12, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v0, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:  .LBB54_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v19
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -11249,126 +13049,260 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i8_to_v8bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_4
-; GFX11-NEXT:  .LBB55_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB55_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v20
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v9
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v11
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB55_2
-; GFX11-NEXT:  .LBB55_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v17, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v19, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v20, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v11, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v13, v10
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i8_to_v8bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-TRUE16-NEXT:  .LBB55_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v6, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-TRUE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v7.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v8.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v4.h, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v8, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v5, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i8_to_v8bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-FAKE16-NEXT:  .LBB55_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-FAKE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v17, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v19, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v11, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v13, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
index b0627c3d4e77d..1db0cccfe6b72 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define half @bitcast_i16_to_f16(i16 %a, i32 %b) {
 ; GCN-LABEL: bitcast_i16_to_f16:
@@ -58,19 +59,41 @@ define half @bitcast_i16_to_f16(i16 %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_i16_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, 3
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_i16_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB0_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v0.l, 3
+; GFX11-TRUE16-NEXT:  .LBB0_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_i16_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v0, 3
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -132,19 +155,41 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_f16_to_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_add_f16_e32 v0, 0x200, v0
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_f16_to_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB1_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v1.l, 0x200, v0.l
+; GFX11-TRUE16-NEXT:  .LBB1_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_f16_to_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 0x200, v0
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -205,19 +250,41 @@ define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_i16_to_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, 3
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_i16_to_bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v0.l, 3
+; GFX11-TRUE16-NEXT:  .LBB2_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_i16_to_bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v0, 3
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -305,30 +372,61 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_bf16_to_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB3_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:  .LBB3_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_bf16_to_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB3_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT:  .LBB3_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_bf16_to_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB3_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:  .LBB3_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -401,19 +499,41 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_f16_to_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_add_f16_e32 v0, 0x200, v0
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_f16_to_bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB4_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v1.l, 0x200, v0.l
+; GFX11-TRUE16-NEXT:  .LBB4_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_f16_to_bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 0x200, v0
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -503,30 +623,61 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_bf16_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB5_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:  .LBB5_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_bf16_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB5_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT:  .LBB5_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_bf16_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB5_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:  .LBB5_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -543,3 +694,5 @@ end:
   %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ]
   ret half %phi
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index b52d8a89035bc..edeb780d481c4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v8i32_to_v8f32:
@@ -1918,148 +1919,304 @@ define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16bf16_to_v8i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v8
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB11_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v15, v6, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT:    v_bfe_u32 v13, v3, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v10
-; GFX11-NEXT:    v_bfe_u32 v16, v2, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_bfe_u32 v14, v11, 16, 1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
-; GFX11-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT:    v_bfe_u32 v15, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_bfe_u32 v16, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v12
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
-; GFX11-NEXT:  .LBB11_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v8i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v15, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v8, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v16, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v10, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v14, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v11, v14 :: v_dual_add_f32 v11, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v8
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v12, v13 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v15, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v14, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v9
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v14 :: v_dual_lshlrev_b32 v14, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v18, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v15, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v12, v0
+; GFX11-TRUE16-NEXT:  .LBB11_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v8i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB11_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2402,114 +2559,209 @@ define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v36
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i32_to_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
-; GFX11-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB12_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB12_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB12_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v39, 3, v39
-; GFX11-NEXT:    v_add_nc_u32_e32 v37, 3, v37
-; GFX11-NEXT:    v_add_nc_u32_e32 v35, 3, v35
-; GFX11-NEXT:    v_add_nc_u32_e32 v33, 3, v33
-; GFX11-NEXT:    v_add_nc_u32_e32 v32, 3, v32
-; GFX11-NEXT:    v_add_nc_u32_e32 v34, 3, v34
-; GFX11-NEXT:    v_add_nc_u32_e32 v36, 3, v36
-; GFX11-NEXT:    v_add_nc_u32_e32 v38, 3, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB12_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v38
-; GFX11-NEXT:    v_mov_b32_e32 v4, v39
-; GFX11-NEXT:    v_mov_b32_e32 v8, v36
-; GFX11-NEXT:    v_mov_b32_e32 v12, v37
-; GFX11-NEXT:    v_mov_b32_e32 v16, v34
-; GFX11-NEXT:    v_mov_b32_e32 v20, v35
-; GFX11-NEXT:    v_mov_b32_e32 v24, v32
-; GFX11-NEXT:    v_mov_b32_e32 v28, v33
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i32_to_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v27, v5 :: v_dual_mov_b32 v26, v4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v19, v3 :: v_dual_mov_b32 v18, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v32, 3, v32
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v27, 3, v27
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v26, 3, v26
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v19, 3, v19
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v18, 3, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB12_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v27.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v32.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v33.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i32_to_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v39, 3, v39
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v37, 3, v37
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v35, 3, v35
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v33, 3, v33
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v32, 3, v32
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v34, 3, v34
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v36, 3, v36
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v38, 3, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB12_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v38
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v39
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v36
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v33
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3066,226 +3318,459 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i8_to_v8i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v38, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_4
-; GFX11-NEXT:  .LBB13_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB13_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v49
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v50
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v38
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v30
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v35
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v36
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v37
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v11, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v12, v16, v17
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB13_2
-; GFX11-NEXT:  .LBB13_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v51, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v50, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v48, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v38, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v39, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v6, v18, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, v12, 3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_add_nc_u16 v4, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v30, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v3, v35, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v37, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v19, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v21, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v12
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v14
-; GFX11-NEXT:    v_or_b32_e32 v12, v17, v16
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v8i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-TRUE16-NEXT:  .LBB13_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-TRUE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v18.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v15.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v8.h, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v8i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v38, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-FAKE16-NEXT:  .LBB13_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-FAKE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v51, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v50, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v48, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v38, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v39, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v18, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v12, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v30, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v35, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v37, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v19, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v21, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v17, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4996,148 +5481,304 @@ define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16bf16_to_v8f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v8
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB23_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v15, v6, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT:    v_bfe_u32 v13, v3, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v10
-; GFX11-NEXT:    v_bfe_u32 v16, v2, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_bfe_u32 v14, v11, 16, 1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
-; GFX11-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT:    v_bfe_u32 v15, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_bfe_u32 v16, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v12
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
-; GFX11-NEXT:  .LBB23_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v8f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v15, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v8, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v16, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v10, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v14, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v11, v14 :: v_dual_add_f32 v11, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v8
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v12, v13 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v15, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v14, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v9
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v14 :: v_dual_lshlrev_b32 v14, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v18, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v15, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v12, v0
+; GFX11-TRUE16-NEXT:  .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v8f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5480,112 +6121,205 @@ define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v36
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8f32_to_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
-; GFX11-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB24_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v39, 1.0, v39 :: v_dual_add_f32 v32, 1.0, v32
-; GFX11-NEXT:    v_dual_add_f32 v37, 1.0, v37 :: v_dual_add_f32 v34, 1.0, v34
-; GFX11-NEXT:    v_dual_add_f32 v35, 1.0, v35 :: v_dual_add_f32 v36, 1.0, v36
-; GFX11-NEXT:    v_dual_add_f32 v33, 1.0, v33 :: v_dual_add_f32 v38, 1.0, v38
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB24_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v38
-; GFX11-NEXT:    v_mov_b32_e32 v4, v39
-; GFX11-NEXT:    v_mov_b32_e32 v8, v36
-; GFX11-NEXT:    v_mov_b32_e32 v12, v37
-; GFX11-NEXT:    v_mov_b32_e32 v16, v34
-; GFX11-NEXT:    v_mov_b32_e32 v20, v35
-; GFX11-NEXT:    v_mov_b32_e32 v24, v32
-; GFX11-NEXT:    v_mov_b32_e32 v28, v33
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8f32_to_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v27, v5 :: v_dual_mov_b32 v26, v4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v19, v3 :: v_dual_mov_b32 v18, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB24_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v32, 1.0, v32
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v33, 1.0, v33 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB24_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v27.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v32.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v33.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8f32_to_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v39, 1.0, v39 :: v_dual_add_f32 v32, 1.0, v32
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v37, 1.0, v37 :: v_dual_add_f32 v34, 1.0, v34
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v35, 1.0, v35 :: v_dual_add_f32 v36, 1.0, v36
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v33, 1.0, v33 :: v_dual_add_f32 v38, 1.0, v38
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v38
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v39
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v36
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v33
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -6142,226 +6876,459 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i8_to_v8f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v38, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_4
-; GFX11-NEXT:  .LBB25_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB25_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v49
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v50
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v38
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v30
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v35
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v36
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v37
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v11, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v12, v16, v17
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB25_2
-; GFX11-NEXT:  .LBB25_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v51, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v50, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v48, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v38, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v39, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v6, v18, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, v12, 3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_add_nc_u16 v4, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v30, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v3, v35, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v37, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v19, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v21, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v12
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v14
-; GFX11-NEXT:    v_or_b32_e32 v12, v17, v16
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v8f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-TRUE16-NEXT:  .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v18.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v15.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v8.h, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v8f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v38, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-FAKE16-NEXT:  .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v51, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v50, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v48, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v38, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v39, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v18, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v12, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v30, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v35, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v37, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v19, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v21, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v17, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -7903,148 +8870,304 @@ define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16bf16_to_v4i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v8
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB33_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v15, v6, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT:    v_bfe_u32 v13, v3, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v10
-; GFX11-NEXT:    v_bfe_u32 v16, v2, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_bfe_u32 v14, v11, 16, 1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
-; GFX11-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT:    v_bfe_u32 v15, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_bfe_u32 v16, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v12
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
-; GFX11-NEXT:  .LBB33_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v4i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v15, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v8, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v16, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v10, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v14, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v11, v14 :: v_dual_add_f32 v11, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v8
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v12, v13 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v15, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v14, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v9
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v14 :: v_dual_lshlrev_b32 v14, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v18, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v15, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v12, v0
+; GFX11-TRUE16-NEXT:  .LBB33_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v4i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB33_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -8387,117 +9510,215 @@ define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v36
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i64_to_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
-; GFX11-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB34_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_co_u32 v36, vcc_lo, v36, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v37, null, 0, v37, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v34, vcc_lo, v34, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v35, null, 0, v35, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v32, vcc_lo, v32, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v33, null, 0, v33, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v38, vcc_lo, v38, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v39, null, 0, v39, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB34_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v38
-; GFX11-NEXT:    v_mov_b32_e32 v4, v39
-; GFX11-NEXT:    v_mov_b32_e32 v8, v36
-; GFX11-NEXT:    v_mov_b32_e32 v12, v37
-; GFX11-NEXT:    v_mov_b32_e32 v16, v34
-; GFX11-NEXT:    v_mov_b32_e32 v20, v35
-; GFX11-NEXT:    v_mov_b32_e32 v24, v32
-; GFX11-NEXT:    v_mov_b32_e32 v28, v33
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i64_to_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v27, v5 :: v_dual_mov_b32 v26, v4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v19, v3 :: v_dual_mov_b32 v18, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB34_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB34_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v26, vcc_lo, v26, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v32, vcc_lo, v32, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v33, null, 0, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB34_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v27.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v32.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v33.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i64_to_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB34_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v36, vcc_lo, v36, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v37, null, 0, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v34, vcc_lo, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v35, null, 0, v35, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v32, vcc_lo, v32, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v33, null, 0, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v38, vcc_lo, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v39, null, 0, v39, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB34_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v38
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v39
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v36
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v33
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -9054,226 +10275,459 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i8_to_v4i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v38, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_4
-; GFX11-NEXT:  .LBB35_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB35_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v49
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v50
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v38
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v30
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v35
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v36
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v37
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v11, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v12, v16, v17
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB35_2
-; GFX11-NEXT:  .LBB35_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v51, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v50, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v48, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v38, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v39, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v6, v18, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, v12, 3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_add_nc_u16 v4, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v30, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v3, v35, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v37, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v19, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v21, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v12
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v14
-; GFX11-NEXT:    v_or_b32_e32 v12, v17, v16
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v4i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-TRUE16-NEXT:  .LBB35_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-TRUE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v18.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v15.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v8.h, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v4i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v38, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-FAKE16-NEXT:  .LBB35_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-FAKE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v51, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v50, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v48, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v38, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v39, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v18, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v12, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v30, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v35, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v37, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v19, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v21, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v17, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -10571,148 +12025,304 @@ define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16bf16_to_v4f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v8
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB41_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v10, v8, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v15, v6, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_bfe_u32 v12, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT:    v_bfe_u32 v12, v10, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v14, v9, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v9
-; GFX11-NEXT:    v_bfe_u32 v13, v3, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_bfe_u32 v14, v10, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v10
-; GFX11-NEXT:    v_bfe_u32 v16, v2, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_bfe_u32 v14, v11, 16, 1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
-; GFX11-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT:    v_bfe_u32 v15, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_bfe_u32 v16, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v12
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
-; GFX11-NEXT:  .LBB41_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v4f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v15, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v12, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v8, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v16, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v10, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v14, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v11, v14 :: v_dual_add_f32 v11, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v4, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v8
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v12, v13 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v15, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v9, v14 :: v_dual_and_b32 v14, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v9
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v10, v12 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v14 :: v_dual_lshlrev_b32 v14, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v18, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v15, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v12, v0
+; GFX11-TRUE16-NEXT:  .LBB41_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v4f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v15, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v12, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v8, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v14, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v10, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v16, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v12, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v14, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v14, v15, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v11, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v16, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v13, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v12, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB41_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -11049,112 +12659,205 @@ define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v36
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4f64_to_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
-; GFX11-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB42_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB42_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB42_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_f64 v[32:33], v[32:33], 1.0
-; GFX11-NEXT:    v_add_f64 v[34:35], v[34:35], 1.0
-; GFX11-NEXT:    v_add_f64 v[36:37], v[36:37], 1.0
-; GFX11-NEXT:    v_add_f64 v[38:39], v[38:39], 1.0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB42_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v38
-; GFX11-NEXT:    v_mov_b32_e32 v4, v39
-; GFX11-NEXT:    v_mov_b32_e32 v8, v36
-; GFX11-NEXT:    v_mov_b32_e32 v12, v37
-; GFX11-NEXT:    v_mov_b32_e32 v16, v34
-; GFX11-NEXT:    v_mov_b32_e32 v20, v35
-; GFX11-NEXT:    v_mov_b32_e32 v24, v32
-; GFX11-NEXT:    v_mov_b32_e32 v28, v33
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4f64_to_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v27, v5 :: v_dual_mov_b32 v26, v4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v19, v3 :: v_dual_mov_b32 v18, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB42_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB42_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB42_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f64 v[32:33], v[32:33], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB42_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v27.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v32.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v33.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4f64_to_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB42_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB42_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB42_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f64 v[32:33], v[32:33], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[34:35], v[34:35], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[36:37], v[36:37], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[38:39], v[38:39], 1.0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB42_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v38
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v39
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v36
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v33
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -11711,226 +13414,459 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i8_to_v4f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v38, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_4
-; GFX11-NEXT:  .LBB43_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB43_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v51
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v49
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v50
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v48
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v38
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v30
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v35
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v36
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v37
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v11, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v12, v16, v17
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB43_2
-; GFX11-NEXT:  .LBB43_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v51, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v50, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v48, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v38, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v39, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v6, v18, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, v12, 3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_add_nc_u16 v4, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v30, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v3, v35, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v36, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v37, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v19, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v21, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v12
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v14
-; GFX11-NEXT:    v_or_b32_e32 v12, v17, v16
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v4f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-TRUE16-NEXT:  .LBB43_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v15.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v3.h, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v4.h, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-TRUE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v20.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v18.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v15.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v15.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v16.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v8.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v8.h, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v9.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v9.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v3.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v4.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v15, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v4f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v38, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-FAKE16-NEXT:  .LBB43_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-FAKE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v51, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v50, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v48, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v38, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v39, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v18, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v12, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v30, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v35, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v36, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v37, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v19, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v21, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v17, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -12996,152 +14932,310 @@ define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16bf16_to_v16i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v8
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB47_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0
-; GFX11-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v8
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v12, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v12, v12, v0, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v15 :: v_dual_lshlrev_b32 v15, 16, v4
-; GFX11-NEXT:    v_bfe_u32 v12, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v13, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v12, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v18, v4, 16, 1
-; GFX11-NEXT:    v_perm_b32 v1, v1, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v13, v10, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v2
-; GFX11-NEXT:    v_bfe_u32 v13, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v13, v3, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v11, v11, v12 :: v_dual_lshlrev_b32 v12, 16, v5
-; GFX11-NEXT:    v_add3_u32 v14, v14, v15, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_add3_u32 v15, v18, v4, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v13, v13, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v19, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT:    v_add3_u32 v18, v19, v12, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v6
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v15, v17 :: v_dual_add_f32 v15, 0x40c00000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_add3_u32 v17, v21, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v18, v20, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
-; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v22, v6, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v21, v22, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT:    v_add3_u32 v23, v23, v18, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v18
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v21, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v23, v24, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v19, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v18, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_perm_b32 v4, v4, v14, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v12, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v11, 0x7060302
-; GFX11-NEXT:  .LBB47_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v16i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v11, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v16, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v15, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, v12, v13 :: v_dual_and_b32 v13, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v8, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v12, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v11, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v12, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_add_f32 v12, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v14, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v15, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v15, v18, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_cndmask_b32 v13, v16, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v20, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v20, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v5.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v16, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v11, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v20, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v6, 16, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v5, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v4, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v14, 16, v4
+; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v16i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x400000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v12, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v12, v15 :: v_dual_lshlrev_b32 v15, 16, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v13, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v12, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v11, v12 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v14, v14, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v15, v18, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v19, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v15, v17 :: v_dual_add_f32 v15, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v21, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v23, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v21, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v19, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v14, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v12, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v11, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB47_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -13600,114 +15694,209 @@ define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v36
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i16_to_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
-; GFX11-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB48_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v39, v39, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v37, v37, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v35, v35, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v34, v34, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v36, v36, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v38, v38, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB48_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v38
-; GFX11-NEXT:    v_mov_b32_e32 v4, v39
-; GFX11-NEXT:    v_mov_b32_e32 v8, v36
-; GFX11-NEXT:    v_mov_b32_e32 v12, v37
-; GFX11-NEXT:    v_mov_b32_e32 v16, v34
-; GFX11-NEXT:    v_mov_b32_e32 v20, v35
-; GFX11-NEXT:    v_mov_b32_e32 v24, v32
-; GFX11-NEXT:    v_mov_b32_e32 v28, v33
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i16_to_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v27, v5 :: v_dual_mov_b32 v26, v4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v19, v3 :: v_dual_mov_b32 v18, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB48_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB48_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v27.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v32.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v33.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i16_to_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB48_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v39, v39, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v37, v37, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v35, v35, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v34, v34, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v36, v36, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v38, v38, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB48_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v38
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v39
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v36
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v33
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -14296,194 +16485,374 @@ define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i8_to_v16i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v38, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_4
-; GFX11-NEXT:  .LBB49_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB49_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v35
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v38
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v48
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v30
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v49
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v21
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v23
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v25
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v12, v11, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB49_2
-; GFX11-NEXT:  .LBB49_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v20, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v5, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v25, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v19, v3
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v4
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v17, v5
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v15, v0
-; GFX11-NEXT:    v_add_nc_u16 v2, v8, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v39, v1
-; GFX11-NEXT:    v_add_nc_u16 v14, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v13, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v11, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v48, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v38, v4
-; GFX11-NEXT:    v_or_b32_e32 v11, v37, v11
-; GFX11-NEXT:    v_or_b32_e32 v13, v35, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v36, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_perm_b32 v0, v11, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v13, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v15, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v12, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-TRUE16-NEXT:  .LBB49_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v19.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-TRUE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v20.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v19.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v22.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v18.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v21.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v13.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v17.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v16.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v15.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v14.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v13.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v12.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v11.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v10.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v11.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v38, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-FAKE16-NEXT:  .LBB49_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v25
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-FAKE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v30, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v20, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v25, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v19, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v17, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v15, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v8, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v39, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v13, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v11, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v48, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v38, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v37, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v35, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v36, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v13, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v15, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v12, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -15205,152 +17574,308 @@ define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16bf16_to_v16f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v8
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB51_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0
-; GFX11-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v13, v9, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v11, v8, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v8
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v14, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v12, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v12, v12, v0, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v12, v15 :: v_dual_lshlrev_b32 v15, 16, v4
-; GFX11-NEXT:    v_bfe_u32 v12, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v13, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v12, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v18, v4, 16, 1
-; GFX11-NEXT:    v_perm_b32 v1, v1, v9, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v13, v10, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v13, v2, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v2
-; GFX11-NEXT:    v_bfe_u32 v13, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v11, v13, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_bfe_u32 v14, v15, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v13, v3, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v11, v11, v12 :: v_dual_lshlrev_b32 v12, 16, v5
-; GFX11-NEXT:    v_add3_u32 v14, v14, v15, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_add3_u32 v15, v18, v4, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v13, v13, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v19, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT:    v_add3_u32 v18, v19, v12, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v6
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v15, v17 :: v_dual_add_f32 v15, 0x40c00000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_add3_u32 v17, v21, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v18, v20, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v15
-; GFX11-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v22, v6, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v21, v22, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_bfe_u32 v19, v7, 16, 1
-; GFX11-NEXT:    v_add3_u32 v23, v23, v18, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v18
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v21, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v23, v24, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v19, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v7, v7, v18, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_perm_b32 v4, v4, v14, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v5, v5, v12, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v11, 0x7060302
-; GFX11-NEXT:  .LBB51_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v16f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v11, v12 :: v_dual_and_b32 v9, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v14, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v13, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v9, v14, v16 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v14, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v14, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v15, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v13, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v16, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v13, v18, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v16, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v13, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v20, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v13, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v23, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v20, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v13, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v18.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v15, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v13, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v11
+; GFX11-TRUE16-NEXT:  .LBB51_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v16f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x400000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v12, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v12, v15 :: v_dual_lshlrev_b32 v15, 16, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v13, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v12, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v2, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v13, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, v11, v12 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v14, v14, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v15, v18, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v14, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v19, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v15, v17 :: v_dual_add_f32 v15, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v21, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v19, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v23, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v21, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v15, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v23, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v19, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v14, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v12, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v11, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB51_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -15797,114 +18322,209 @@ define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v36
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16f16_to_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
-; GFX11-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB52_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB52_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB52_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v39, 0x200, v39 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v37, 0x200, v37 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v35, 0x200, v35 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v33, 0x200, v33 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v34, 0x200, v34 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v36, 0x200, v36 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v38, 0x200, v38 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB52_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v38
-; GFX11-NEXT:    v_mov_b32_e32 v4, v39
-; GFX11-NEXT:    v_mov_b32_e32 v8, v36
-; GFX11-NEXT:    v_mov_b32_e32 v12, v37
-; GFX11-NEXT:    v_mov_b32_e32 v16, v34
-; GFX11-NEXT:    v_mov_b32_e32 v20, v35
-; GFX11-NEXT:    v_mov_b32_e32 v24, v32
-; GFX11-NEXT:    v_mov_b32_e32 v28, v33
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16f16_to_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v27, v5 :: v_dual_mov_b32 v26, v4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v19, v3 :: v_dual_mov_b32 v18, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v10, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB52_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB52_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB52_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v33, 0x200, v33 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[32:33]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[36:37], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB52_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v27.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v32.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v33.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16f16_to_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB52_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB52_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB52_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v39, 0x200, v39 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v37, 0x200, v37 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v35, 0x200, v35 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v33, 0x200, v33 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v34, 0x200, v34 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v36, 0x200, v36 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v38, 0x200, v38 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB52_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v38
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v39
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v36
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v33
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -16460,194 +19080,374 @@ define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i8_to_v16f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v38, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_4
-; GFX11-NEXT:  .LBB53_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB53_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v35
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v38
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v48
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v30
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v49
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v21
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v23
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v25
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v12, v11, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB53_2
-; GFX11-NEXT:  .LBB53_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v20, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v5, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v25, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v19, v3
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v4
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v17, v5
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v15, v0
-; GFX11-NEXT:    v_add_nc_u16 v2, v8, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v39, v1
-; GFX11-NEXT:    v_add_nc_u16 v14, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v13, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v11, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v48, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v38, v4
-; GFX11-NEXT:    v_or_b32_e32 v11, v37, v11
-; GFX11-NEXT:    v_or_b32_e32 v13, v35, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v36, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_perm_b32 v0, v11, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v13, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v15, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v12, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-TRUE16-NEXT:  .LBB53_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v19.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-TRUE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v20.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v19.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v22.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v18.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v21.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v13.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v17.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v16.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v15.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v14.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v13.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v12.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v11.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v10.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v11.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v38, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-FAKE16-NEXT:  .LBB53_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v25
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-FAKE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v30, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v20, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v25, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v19, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v17, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v15, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v8, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v39, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v13, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v11, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v48, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v38, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v37, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v35, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v36, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v13, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v15, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v12, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -17333,241 +20133,493 @@ define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v7, v36
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16bf16_to_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
-; GFX11-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
-; GFX11-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB54_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
-; GFX11-NEXT:  .LBB54_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB54_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v39
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v39
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v38
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add3_u32 v1, v9, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v0, 0x400000, v2
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v37
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v37
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v0, vcc_lo
-; GFX11-NEXT:    v_dual_add_f32 v7, 0x40c00000, v8 :: v_dual_add_f32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v36
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v7
-; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v9, v7, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_add3_u32 v9, v9, v7, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v12, v3, 16, 1
-; GFX11-NEXT:    v_perm_b32 v1, v5, v4, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v9, v11, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v9, v12, v3, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v13, v7, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v35
-; GFX11-NEXT:    v_perm_b32 v8, v11, v6, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v9, 0x40c00000, v12
-; GFX11-NEXT:    v_add3_u32 v12, v13, v7, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v35
-; GFX11-NEXT:    v_bfe_u32 v15, v9, 16, 1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v12, v13 :: v_dual_add_f32 v12, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_add3_u32 v13, v15, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_bfe_u32 v17, v12, 16, 1
-; GFX11-NEXT:    v_perm_b32 v7, v7, v3, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
-; GFX11-NEXT:    v_dual_cndmask_b32 v13, v13, v14 :: v_dual_add_f32 v14, 0x40c00000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v16, v17, v12, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
-; GFX11-NEXT:    v_bfe_u32 v15, v10, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_bfe_u32 v19, v14, 16, 1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v9, v15, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v10
-; GFX11-NEXT:    v_add3_u32 v10, v19, v14, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v20, v9, v15 :: v_dual_add_f32 v9, 0x40c00000, v17
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff0000, v33
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_or_b32_e32 v15, 0x400000, v14
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v16, v18, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_bfe_u32 v16, v9, 16, 1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v32
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v10, v15 :: v_dual_add_f32 v15, 0x40c00000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v18
-; GFX11-NEXT:    v_add3_u32 v16, v16, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v16, v18, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v16, v19, v15, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v16, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v32
-; GFX11-NEXT:    v_perm_b32 v16, v19, v18, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v21, v22, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v17
-; GFX11-NEXT:    v_bfe_u32 v9, v10, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v10, v20, v13, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v9, v23, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v9, v14, v12, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v10
-; GFX11-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 8, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[27:28], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:  .LBB54_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v38
-; GFX11-NEXT:    v_mov_b32_e32 v4, v39
-; GFX11-NEXT:    v_mov_b32_e32 v8, v36
-; GFX11-NEXT:    v_mov_b32_e32 v12, v37
-; GFX11-NEXT:    v_mov_b32_e32 v16, v34
-; GFX11-NEXT:    v_mov_b32_e32 v20, v35
-; GFX11-NEXT:    v_mov_b32_e32 v24, v32
-; GFX11-NEXT:    v_mov_b32_e32 v28, v33
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16bf16_to_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v27, v7 :: v_dual_mov_b32 v26, v6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v19, v5 :: v_dual_mov_b32 v18, v4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v11, v3 :: v_dual_mov_b32 v10, v2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[32:33], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[2:3]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v26.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v27.h
+; GFX11-TRUE16-NEXT:  .LBB54_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v4 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v0, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v0, v0, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v12, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v12, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, v0, v1 :: v_dual_add_f32 v5, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v12.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v9, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v1, v14
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v15, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v8, v13 :: v_dual_add_f32 v1, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v16, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v13, v15, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v17, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v19, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v27
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v16, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v20.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v10, v13 :: v_dual_add_f32 v10, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v16, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v27
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v23, 0x40c00000, v16 :: v_dual_cndmask_b32 v16, v15, v18
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v19, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v26
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v5, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v19, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v28.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v25, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v19, 0xffff, v17, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v15, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v8.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v27, 0xffff, v10, v30
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v5, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v24.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v18, 0xffff, v15, v1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v21, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v9, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v27
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v26, 0xffff, v13, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[32:33], 24, v[26:27]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[33:34], 24, v[18:19]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[34:35], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[35:36], 24, v[2:3]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v26
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  .LBB54_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v18.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v22.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v24.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v26.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v28.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v30.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16bf16_to_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB54_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-FAKE16-NEXT:  .LBB54_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v1, v9, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v37
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v8 :: v_dual_add_f32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v36
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v5, v4, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v12, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v35
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v11, v6, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_add3_u32 v12, v13, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v35
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v12, v13 :: v_dual_add_f32 v12, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v15, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v13, v13, v14 :: v_dual_add_f32 v14, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v17, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v15, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v19, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v9, v15 :: v_dual_add_f32 v9, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v16, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v32
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v10, v15 :: v_dual_add_f32 v15, 0x40c00000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v16, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v19, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v32
+; GFX11-FAKE16-NEXT:    v_perm_b32 v16, v19, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v21, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v20, v13, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v9, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v14, v12, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[27:28], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:  .LBB54_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v38
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v39
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v36
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v37
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v33
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -18154,194 +21206,374 @@ define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i8_to_v16bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v35, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v38, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_4
-; GFX11-NEXT:  .LBB55_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB55_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v35
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v38
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v48
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v9
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v24
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v30
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v39
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v49
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v21
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v23
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v25
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v12, v11, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB55_2
-; GFX11-NEXT:  .LBB55_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v20, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v5, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v25, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v21, v2
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v19, v3
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v4
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v17, v5
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v14, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v15, v0
-; GFX11-NEXT:    v_add_nc_u16 v2, v8, 3
-; GFX11-NEXT:    v_or_b32_e32 v1, v39, v1
-; GFX11-NEXT:    v_add_nc_u16 v14, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v13, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v11, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v34, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v2, v48, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v38, v4
-; GFX11-NEXT:    v_or_b32_e32 v11, v37, v11
-; GFX11-NEXT:    v_or_b32_e32 v13, v35, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v36, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_perm_b32 v0, v11, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v13, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v15, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v12, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i8_to_v16bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-TRUE16-NEXT:  .LBB55_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v19.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-TRUE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v20.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v19.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v22.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v18.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v21.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v13.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v14.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v17.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v16.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v15.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v14.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v13.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v12.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v9.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v11.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v10.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v11.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v9.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v8.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i8_to_v16bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v35, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v38, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-FAKE16-NEXT:  .LBB55_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v25
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-FAKE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v30, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v20, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v25, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v21, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v19, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v17, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v14, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v15, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v8, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v39, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v13, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v11, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v34, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v48, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v38, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v37, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v35, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v36, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v13, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v15, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v12, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 467e7740d24cf..6e6e62c4b05ad 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v10i32_to_v10f32:
@@ -219,384 +220,6 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x i16> @bitcast_v10i32_to_v20i16(<10 x i32> %a, i32 %b) {
-; GCN-LABEL: bitcast_v10i32_to_v20i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v18, v9
-; GCN-NEXT:    v_mov_b32_e32 v16, v8
-; GCN-NEXT:    v_mov_b32_e32 v14, v7
-; GCN-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-NEXT:    v_mov_b32_e32 v20, v5
-; GCN-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-NEXT:    v_mov_b32_e32 v6, v3
-; GCN-NEXT:    v_mov_b32_e32 v4, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    ; implicit-def: $vgpr1
-; GCN-NEXT:    ; implicit-def: $vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr5
-; GCN-NEXT:    ; implicit-def: $vgpr7
-; GCN-NEXT:    ; implicit-def: $vgpr9
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr19
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB2_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v20, v8, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT:  .LBB2_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB2_4
-; GCN-NEXT:  ; %bb.3: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 3, v20
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 3, v14
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 3, v12
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 3, v18
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
-; GCN-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v20, v8, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT:  .LBB2_4: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v10, v20
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v10i32_to_v20i16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB2_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_add_u32_e32 v9, vcc, 3, v9
-; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
-; VI-NEXT:    v_add_u32_e32 v7, vcc, 3, v7
-; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v6
-; VI-NEXT:    v_add_u32_e32 v5, vcc, 3, v5
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v4
-; VI-NEXT:    v_add_u32_e32 v3, vcc, 3, v3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:  .LBB2_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v10i32_to_v20i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_add_u32_e32 v9, 3, v9
-; GFX9-NEXT:    v_add_u32_e32 v8, 3, v8
-; GFX9-NEXT:    v_add_u32_e32 v7, 3, v7
-; GFX9-NEXT:    v_add_u32_e32 v6, 3, v6
-; GFX9-NEXT:    v_add_u32_e32 v5, 3, v5
-; GFX9-NEXT:    v_add_u32_e32 v4, 3, v4
-; GFX9-NEXT:    v_add_u32_e32 v3, 3, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, 3, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, 3, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT:  .LBB2_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v10i32_to_v20i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT:  .LBB2_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = add <10 x i32> %a, splat (i32 3)
-  %a2 = bitcast <10 x i32> %a1 to <20 x i16>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <10 x i32> %a to <20 x i16>
-  br label %end
-
-end:
-  %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x i16> %phi
-}
-
-define <10 x i32> @bitcast_v20i16_to_v10i32(<20 x i16> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20i16_to_v10i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v25, v8
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v23, v4
-; GCN-NEXT:    v_mov_b32_e32 v22, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v29, 16, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB3_3
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB3_4
-; GCN-NEXT:  .LBB3_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB3_3: ; %cmp.false
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v21
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v22
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v23
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v24
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v25
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v10
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v12
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v14
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v16
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v18
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v20
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v26
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v27
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v28
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v29
-; GCN-NEXT:    v_or_b32_e32 v5, v5, v11
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v13
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v15
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v17
-; GCN-NEXT:    v_or_b32_e32 v9, v9, v19
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr10
-; GCN-NEXT:    ; implicit-def: $vgpr12
-; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr19
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB3_2
-; GCN-NEXT:  .LBB3_4: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v21
-; GCN-NEXT:    s_mov_b32 s6, 0x30000
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v22
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v23
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v24
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v25
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v10
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v12
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v14
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v16
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v18
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GCN-NEXT:    v_or_b32_e32 v0, v20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v26, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v27, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v28, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v29, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v13, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v15, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v17, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v19, v9
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x30000, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, s6, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, s6, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, s6, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 0x30000, v7
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x30000, v8
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v9
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v20i16_to_v10i32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB3_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v11, 3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v9
-; VI-NEXT:    v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:    v_add_u16_e32 v10, 3, v8
-; VI-NEXT:    v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v8, v10, v8
-; VI-NEXT:    v_add_u16_e32 v10, 3, v7
-; VI-NEXT:    v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v7, v10, v7
-; VI-NEXT:    v_add_u16_e32 v10, 3, v6
-; VI-NEXT:    v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v6, v10, v6
-; VI-NEXT:    v_add_u16_e32 v10, 3, v5
-; VI-NEXT:    v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v5, v10, v5
-; VI-NEXT:    v_add_u16_e32 v10, 3, v4
-; VI-NEXT:    v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_add_u16_e32 v10, 3, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v3, v10, v3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v2
-; VI-NEXT:    v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v10, v2
-; VI-NEXT:    v_add_u16_e32 v10, 3, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v10, v1
-; VI-NEXT:    v_add_u16_e32 v10, 3, v0
-; VI-NEXT:    v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v10, v0
-; VI-NEXT:  .LBB3_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v20i16_to_v10i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB3_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB3_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v20i16_to_v10i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB3_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:  .LBB3_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = add <20 x i16> %a, splat (i16 3)
-  %a2 = bitcast <20 x i16> %a1 to <10 x i32>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <20 x i16> %a to <10 x i32>
-  br label %end
-
-end:
-  %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <10 x i32> %phi
-}
-
 define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v10i32_to_v20f16:
 ; GCN:       ; %bb.0:
@@ -634,14 +257,14 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB4_3
+; GCN-NEXT:    s_cbranch_execnz .LBB2_3
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB4_4
-; GCN-NEXT:  .LBB4_2: ; %end
+; GCN-NEXT:    s_cbranch_execnz .LBB2_4
+; GCN-NEXT:  .LBB2_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB4_3: ; %cmp.false
+; GCN-NEXT:  .LBB2_3: ; %cmp.false
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v27
@@ -683,8 +306,8 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr28
 ; GCN-NEXT:    ; implicit-def: $vgpr29
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB4_2
-; GCN-NEXT:  .LBB4_4: ; %cmp.true
+; GCN-NEXT:    s_cbranch_execz .LBB2_2
+; GCN-NEXT:  .LBB2_4: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v20
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v21
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v22
@@ -735,7 +358,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB4_2
+; VI-NEXT:    s_cbranch_execz .LBB2_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 3, v9
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
@@ -747,7 +370,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:  .LBB4_2: ; %end
+; VI-NEXT:  .LBB2_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -758,7 +381,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB4_2
+; GFX9-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_u32_e32 v9, 3, v9
 ; GFX9-NEXT:    v_add_u32_e32 v8, 3, v8
@@ -770,7 +393,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT:  .LBB4_2: ; %end
+; GFX9-NEXT:  .LBB2_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -782,7 +405,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB4_2
+; GFX11-NEXT:    s_cbranch_execz .LBB2_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
 ; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
@@ -794,7 +417,7 @@ define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) {
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT:  .LBB4_2: ; %end
+; GFX11-NEXT:  .LBB2_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -842,14 +465,14 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB5_3
+; GCN-NEXT:    s_cbranch_execnz .LBB3_3
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB5_4
-; GCN-NEXT:  .LBB5_2: ; %end
+; GCN-NEXT:    s_cbranch_execnz .LBB3_4
+; GCN-NEXT:  .LBB3_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB5_3: ; %cmp.false
+; GCN-NEXT:  .LBB3_3: ; %cmp.false
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v33
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
 ; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
@@ -891,8 +514,8 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr13
 ; GCN-NEXT:    ; implicit-def: $vgpr10
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB5_2
-; GCN-NEXT:  .LBB5_4: ; %cmp.true
+; GCN-NEXT:    s_cbranch_execz .LBB3_2
+; GCN-NEXT:  .LBB3_4: ; %cmp.true
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v33
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v29
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v32
@@ -983,7 +606,7 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB5_2
+; VI-NEXT:    s_cbranch_execz .LBB3_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_mov_b32_e32 v10, 0x200
 ; VI-NEXT:    v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1016,7 +639,7 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    v_add_f16_e32 v0, 0x200, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v11
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v10
-; VI-NEXT:  .LBB5_2: ; %end
+; VI-NEXT:  .LBB3_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1027,7 +650,7 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    s_movk_i32 s6, 0x200
 ; GFX9-NEXT:    v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
@@ -1040,7 +663,7 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB5_2: ; %end
+; GFX9-NEXT:  .LBB3_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1052,7 +675,7 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB5_2
+; GFX11-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
@@ -1064,7 +687,7 @@ define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) {
 ; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:  .LBB5_2: ; %end
+; GFX11-NEXT:  .LBB3_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -1121,7 +744,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr16
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB6_2
+; GCN-NEXT:    s_cbranch_execz .LBB4_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_alignbit_b32 v11, v10, v9, 24
 ; GCN-NEXT:    v_alignbit_b32 v12, v10, v9, 16
@@ -1153,9 +776,9 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB6_2: ; %Flow
+; GCN-NEXT:  .LBB4_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB6_4
+; GCN-NEXT:    s_cbranch_execz .LBB4_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
@@ -1197,7 +820,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB6_4: ; %end
+; GCN-NEXT:  .LBB4_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v49, 0xff, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v35, 8, v35
@@ -1347,7 +970,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB6_2
+; VI-NEXT:    s_cbranch_execz .LBB4_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -1379,9 +1002,9 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB6_2: ; %Flow
+; VI-NEXT:  .LBB4_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB6_4
+; VI-NEXT:    s_cbranch_execz .LBB4_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 3, v10
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 3, v9
@@ -1423,7 +1046,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB6_4: ; %end
+; VI-NEXT:  .LBB4_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; VI-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -1533,7 +1156,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr11
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB6_2
+; GFX9-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -1565,9 +1188,9 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB6_2: ; %Flow
+; GFX9-NEXT:  .LBB4_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB6_4
+; GFX9-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    v_add_u32_e32 v10, 3, v10
 ; GFX9-NEXT:    v_add_u32_e32 v9, 3, v9
@@ -1609,7 +1232,7 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB6_4: ; %end
+; GFX9-NEXT:  .LBB4_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -1674,217 +1297,401 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v10i32_to_v40i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB6_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB6_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB6_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB6_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v16
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xff, v48
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v31, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v39
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v16
-; GFX11-NEXT:    v_or_b32_e32 v15, v48, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v14, v35, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v31
-; GFX11-NEXT:    v_or_b32_e32 v13, v30, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_or_b32_e32 v16, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v29
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v27
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v26
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v32
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v34
-; GFX11-NEXT:    v_or_b32_e32 v32, v33, v32
-; GFX11-NEXT:    v_or_b32_e32 v12, v25, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v16
-; GFX11-NEXT:    v_or_b32_e32 v11, v20, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v30
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v15
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v10i32_to_v40i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB4_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB4_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB4_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB4_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v1.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v2.h, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v14, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v6.h, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v30, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v26, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v13, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v17, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB4_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB4_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB4_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB4_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v31, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v48, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v35, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v38, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v33, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v25, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v20, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v15
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1950,7 +1757,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB7_2
+; GCN-NEXT:    s_cbranch_execz .LBB5_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v31
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v32
@@ -2062,9 +1869,9 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr23
 ; GCN-NEXT:    ; implicit-def: $vgpr53
 ; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:  .LBB7_2: ; %Flow
+; GCN-NEXT:  .LBB5_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB7_4
+; GCN-NEXT:    s_cbranch_execz .LBB5_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v31
 ; GCN-NEXT:    s_movk_i32 s6, 0x300
@@ -2178,7 +1985,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x3000000, v8
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v9
-; GCN-NEXT:  .LBB7_4: ; %end
+; GCN-NEXT:  .LBB5_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -2238,7 +2045,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_lshlrev_b16_e32 v13, 8, v44
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB7_2
+; VI-NEXT:    s_cbranch_execz .LBB5_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2311,9 +2118,9 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr15
 ; VI-NEXT:    ; implicit-def: $vgpr13
 ; VI-NEXT:    ; implicit-def: $vgpr11
-; VI-NEXT:  .LBB7_2: ; %Flow
+; VI-NEXT:  .LBB5_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB7_4
+; VI-NEXT:    s_cbranch_execz .LBB5_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u16_e32 v0, 3, v31
 ; VI-NEXT:    v_add_u16_e32 v1, 3, v32
@@ -2387,7 +2194,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_add_u16_e32 v10, 0x300, v10
 ; VI-NEXT:    v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:  .LBB7_4: ; %end
+; VI-NEXT:  .LBB5_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -2450,7 +2257,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 8, v44
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB7_2
+; GFX9-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2523,9 +2330,9 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr15
 ; GFX9-NEXT:    ; implicit-def: $vgpr13
 ; GFX9-NEXT:    ; implicit-def: $vgpr11
-; GFX9-NEXT:  .LBB7_2: ; %Flow
+; GFX9-NEXT:  .LBB5_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB7_4
+; GFX9-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    v_add_u16_e32 v0, 3, v31
 ; GFX9-NEXT:    v_add_u16_e32 v1, 3, v32
@@ -2599,7 +2406,7 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u16_e32 v9, 0x300, v9
 ; GFX9-NEXT:    v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX9-NEXT:  .LBB7_4: ; %end
+; GFX9-NEXT:  .LBB5_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -2609,290 +2416,584 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v40i8_to_v10i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6
-; GFX11-NEXT:    v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2
-; GFX11-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-NEXT:    s_clause 0x9
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v37, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_u16 v38, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v66
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB7_4
-; GFX11-NEXT:  .LBB7_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB7_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v53
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v64
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v65
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v48
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v49
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v50
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v51
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v52
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v23
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v25
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v27
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v29
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB7_2
-; GFX11-NEXT:  .LBB7_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v18, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v53, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v54, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v55, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v64, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v65, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v48, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v49, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v50, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v51, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v52, v9
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_add_nc_u16 v5, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v12, v39, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v36, 3
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v5, v21, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v23, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v25, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v27, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v29, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v13, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v15, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v17, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v19, v18
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x9
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v27.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v36
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB5_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX11-TRUE16-NEXT:  .LBB5_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB5_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v24.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v17, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB5_2
+; GFX11-TRUE16-NEXT:  .LBB5_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v26.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v25.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v25.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v23.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v22.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v23.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v24.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v15.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v15.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v17.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v16.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v18.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v13.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v14.h, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v13.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v14.l, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v10.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v11.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v11.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v12.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v17, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x9
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v36, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v37, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v38, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB5_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX11-FAKE16-NEXT:  .LBB5_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB5_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB5_2
+; GFX11-FAKE16-NEXT:  .LBB5_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v18, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v53, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v54, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v55, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v64, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v65, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v48, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v49, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v50, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v51, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v52, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v39, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v36, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v21, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v23, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v25, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v27, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v29, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v13, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v15, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v17, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v19, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2918,7 +3019,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB8_2
+; GCN-NEXT:    s_cbranch_execz .LBB6_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
@@ -2930,7 +3031,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; GCN-NEXT:  .LBB8_2: ; %end
+; GCN-NEXT:  .LBB6_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2941,7 +3042,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB8_2
+; VI-NEXT:    s_cbranch_execz .LBB6_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 3, v9
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
@@ -2953,7 +3054,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:  .LBB8_2: ; %end
+; VI-NEXT:  .LBB6_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2964,7 +3065,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_u32_e32 v9, 3, v9
 ; GFX9-NEXT:    v_add_u32_e32 v8, 3, v8
@@ -2976,7 +3077,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT:  .LBB8_2: ; %end
+; GFX9-NEXT:  .LBB6_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2988,7 +3089,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB8_2
+; GFX11-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
 ; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
@@ -3000,7 +3101,7 @@ define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) {
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT:  .LBB8_2: ; %end
+; GFX11-NEXT:  .LBB6_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -3028,14 +3129,14 @@ define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB9_2
+; GCN-NEXT:    s_cbranch_execz .LBB7_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; GCN-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GCN-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GCN-NEXT:  .LBB9_2: ; %end
+; GCN-NEXT:  .LBB7_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3046,14 +3147,14 @@ define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB9_2
+; VI-NEXT:    s_cbranch_execz .LBB7_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; VI-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; VI-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; VI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT:  .LBB9_2: ; %end
+; VI-NEXT:  .LBB7_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3064,14 +3165,14 @@ define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; GFX9-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:  .LBB9_2: ; %end
+; GFX9-NEXT:  .LBB7_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3083,14 +3184,14 @@ define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB9_2
+; GFX11-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; GFX11-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX11-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT:  .LBB9_2: ; %end
+; GFX11-NEXT:  .LBB7_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -3118,7 +3219,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB10_2
+; GCN-NEXT:    s_cbranch_execz .LBB8_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
@@ -3130,7 +3231,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; GCN-NEXT:  .LBB10_2: ; %end
+; GCN-NEXT:  .LBB8_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3141,7 +3242,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB10_2
+; VI-NEXT:    s_cbranch_execz .LBB8_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 3, v9
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
@@ -3153,7 +3254,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:  .LBB10_2: ; %end
+; VI-NEXT:  .LBB8_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3164,7 +3265,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB10_2
+; GFX9-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_u32_e32 v9, 3, v9
 ; GFX9-NEXT:    v_add_u32_e32 v8, 3, v8
@@ -3176,7 +3277,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u32_e32 v2, 3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, 3, v0
-; GFX9-NEXT:  .LBB10_2: ; %end
+; GFX9-NEXT:  .LBB8_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3188,7 +3289,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB10_2
+; GFX11-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
 ; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
@@ -3200,7 +3301,7 @@ define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) {
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT:  .LBB10_2: ; %end
+; GFX11-NEXT:  .LBB8_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -3228,7 +3329,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB11_2
+; GCN-NEXT:    s_cbranch_execz .LBB9_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
@@ -3240,7 +3341,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:  .LBB11_2: ; %end
+; GCN-NEXT:  .LBB9_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3251,7 +3352,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB11_2
+; VI-NEXT:    s_cbranch_execz .LBB9_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
@@ -3263,7 +3364,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:  .LBB11_2: ; %end
+; VI-NEXT:  .LBB9_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3274,7 +3375,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB11_2
+; GFX9-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 3, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
@@ -3286,7 +3387,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:  .LBB11_2: ; %end
+; GFX9-NEXT:  .LBB9_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3298,7 +3399,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -3313,7 +3414,7 @@ define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT:  .LBB11_2: ; %end
+; GFX11-NEXT:  .LBB9_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -3333,81 +3434,145 @@ end:
   ret <10 x i32> %phi
 }
 
-define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) {
-; GCN-LABEL: bitcast_v10f32_to_v20i16:
+define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v10f32_to_v20f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v18, v9
-; GCN-NEXT:    v_mov_b32_e32 v16, v8
-; GCN-NEXT:    v_mov_b32_e32 v14, v7
-; GCN-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-NEXT:    v_mov_b32_e32 v20, v5
-; GCN-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-NEXT:    v_mov_b32_e32 v6, v3
-; GCN-NEXT:    v_mov_b32_e32 v4, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
+; GCN-NEXT:    v_mov_b32_e32 v29, v9
+; GCN-NEXT:    v_mov_b32_e32 v28, v8
+; GCN-NEXT:    v_mov_b32_e32 v27, v7
+; GCN-NEXT:    v_mov_b32_e32 v26, v6
+; GCN-NEXT:    v_mov_b32_e32 v25, v5
+; GCN-NEXT:    v_mov_b32_e32 v24, v4
+; GCN-NEXT:    v_mov_b32_e32 v23, v3
+; GCN-NEXT:    v_mov_b32_e32 v22, v2
+; GCN-NEXT:    v_mov_b32_e32 v21, v1
+; GCN-NEXT:    v_mov_b32_e32 v20, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; GCN-NEXT:    ; implicit-def: $vgpr0
 ; GCN-NEXT:    ; implicit-def: $vgpr1
+; GCN-NEXT:    ; implicit-def: $vgpr2
 ; GCN-NEXT:    ; implicit-def: $vgpr3
+; GCN-NEXT:    ; implicit-def: $vgpr4
 ; GCN-NEXT:    ; implicit-def: $vgpr5
+; GCN-NEXT:    ; implicit-def: $vgpr6
 ; GCN-NEXT:    ; implicit-def: $vgpr7
+; GCN-NEXT:    ; implicit-def: $vgpr8
 ; GCN-NEXT:    ; implicit-def: $vgpr9
+; GCN-NEXT:    ; implicit-def: $vgpr10
 ; GCN-NEXT:    ; implicit-def: $vgpr11
+; GCN-NEXT:    ; implicit-def: $vgpr12
 ; GCN-NEXT:    ; implicit-def: $vgpr13
+; GCN-NEXT:    ; implicit-def: $vgpr14
 ; GCN-NEXT:    ; implicit-def: $vgpr15
+; GCN-NEXT:    ; implicit-def: $vgpr16
 ; GCN-NEXT:    ; implicit-def: $vgpr17
+; GCN-NEXT:    ; implicit-def: $vgpr18
 ; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB12_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v20, v8, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT:  .LBB12_2: ; %Flow
+; GCN-NEXT:    s_cbranch_execnz .LBB10_3
+; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB12_4
-; GCN-NEXT:  ; %bb.3: ; %cmp.true
-; GCN-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; GCN-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; GCN-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; GCN-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; GCN-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; GCN-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v20, v8, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT:  .LBB12_4: ; %end
+; GCN-NEXT:    s_cbranch_execnz .LBB10_4
+; GCN-NEXT:  .LBB10_2: ; %end
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GCN-NEXT:  .LBB10_3: ; %cmp.false
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v27
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v26
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
+; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v23
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
+; GCN-NEXT:    v_lshrrev_b32_e32 v32, 16, v21
+; GCN-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v29
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v28
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v27
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v26
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v25
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v24
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v23
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v22
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v21
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v0
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v30
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v33
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v20
+; GCN-NEXT:    ; implicit-def: $vgpr20
+; GCN-NEXT:    ; implicit-def: $vgpr21
+; GCN-NEXT:    ; implicit-def: $vgpr22
+; GCN-NEXT:    ; implicit-def: $vgpr23
+; GCN-NEXT:    ; implicit-def: $vgpr24
+; GCN-NEXT:    ; implicit-def: $vgpr25
+; GCN-NEXT:    ; implicit-def: $vgpr26
+; GCN-NEXT:    ; implicit-def: $vgpr27
+; GCN-NEXT:    ; implicit-def: $vgpr28
+; GCN-NEXT:    ; implicit-def: $vgpr29
+; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB10_2
+; GCN-NEXT:  .LBB10_4: ; %cmp.true
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v20
+; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v21
+; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v22
+; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v23
+; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v24
+; GCN-NEXT:    v_add_f32_e32 v11, 1.0, v25
+; GCN-NEXT:    v_add_f32_e32 v13, 1.0, v26
+; GCN-NEXT:    v_add_f32_e32 v15, 1.0, v27
+; GCN-NEXT:    v_add_f32_e32 v17, 1.0, v28
+; GCN-NEXT:    v_add_f32_e32 v19, 1.0, v29
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v19
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v15
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v10, v20
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v10f32_to_v20i16:
+; VI-LABEL: bitcast_v10f32_to_v20f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB12_2
+; VI-NEXT:    s_cbranch_execz .LBB10_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
 ; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
@@ -3419,18 +3584,18 @@ define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) {
 ; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
 ; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; VI-NEXT:  .LBB12_2: ; %end
+; VI-NEXT:  .LBB10_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v10f32_to_v20i16:
+; GFX9-LABEL: bitcast_v10f32_to_v20f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB12_2
+; GFX9-NEXT:    s_cbranch_execz .LBB10_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
 ; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
@@ -3442,11 +3607,11 @@ define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) {
 ; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
 ; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:  .LBB12_2: ; %end
+; GFX9-NEXT:  .LBB10_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v10f32_to_v20i16:
+; GFX11-LABEL: bitcast_v10f32_to_v20f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
@@ -3468,204 +3633,249 @@ define <20 x i16> @bitcast_v10f32_to_v20i16(<10 x float> %a, i32 %b) {
 
 cmp.true:
   %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
-  %a2 = bitcast <10 x float> %a1 to <20 x i16>
+  %a2 = bitcast <10 x float> %a1 to <20 x half>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <10 x float> %a to <20 x i16>
+  %a3 = bitcast <10 x float> %a to <20 x half>
   br label %end
 
 end:
-  %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x i16> %phi
+  %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <20 x half> %phi
 }
 
-define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20i16_to_v10f32:
+define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v20f16_to_v10f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v25, v8
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v23, v4
-; GCN-NEXT:    v_mov_b32_e32 v22, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v0
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v28, 16, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v29, 16, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v14
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v17
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v19
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v18
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB13_3
+; GCN-NEXT:    s_cbranch_execnz .LBB11_3
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB13_4
-; GCN-NEXT:  .LBB13_2: ; %end
+; GCN-NEXT:    s_cbranch_execnz .LBB11_4
+; GCN-NEXT:  .LBB11_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB13_3: ; %cmp.false
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v21
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v22
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v23
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v24
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v25
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v10
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v12
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v14
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v16
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v18
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v20
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v26
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v27
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v28
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v29
-; GCN-NEXT:    v_or_b32_e32 v5, v5, v11
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v13
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v15
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v17
-; GCN-NEXT:    v_or_b32_e32 v9, v9, v19
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr24
+; GCN-NEXT:  .LBB11_3: ; %cmp.false
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v33
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
+; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v30
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v28
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v26
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v24
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v13
+; GCN-NEXT:    v_or_b32_e32 v0, v29, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v27, v1
+; GCN-NEXT:    v_or_b32_e32 v2, v25, v2
+; GCN-NEXT:    v_or_b32_e32 v3, v23, v3
+; GCN-NEXT:    v_or_b32_e32 v4, v22, v4
+; GCN-NEXT:    v_or_b32_e32 v5, v21, v5
+; GCN-NEXT:    v_or_b32_e32 v6, v20, v6
+; GCN-NEXT:    v_or_b32_e32 v7, v12, v7
+; GCN-NEXT:    v_or_b32_e32 v8, v11, v8
+; GCN-NEXT:    v_or_b32_e32 v9, v10, v9
+; GCN-NEXT:    ; implicit-def: $vgpr33
+; GCN-NEXT:    ; implicit-def: $vgpr29
+; GCN-NEXT:    ; implicit-def: $vgpr32
+; GCN-NEXT:    ; implicit-def: $vgpr27
+; GCN-NEXT:    ; implicit-def: $vgpr31
 ; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr10
+; GCN-NEXT:    ; implicit-def: $vgpr30
+; GCN-NEXT:    ; implicit-def: $vgpr23
+; GCN-NEXT:    ; implicit-def: $vgpr28
+; GCN-NEXT:    ; implicit-def: $vgpr22
+; GCN-NEXT:    ; implicit-def: $vgpr26
+; GCN-NEXT:    ; implicit-def: $vgpr21
+; GCN-NEXT:    ; implicit-def: $vgpr24
+; GCN-NEXT:    ; implicit-def: $vgpr20
+; GCN-NEXT:    ; implicit-def: $vgpr15
 ; GCN-NEXT:    ; implicit-def: $vgpr12
 ; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr29
 ; GCN-NEXT:    ; implicit-def: $vgpr11
 ; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr19
+; GCN-NEXT:    ; implicit-def: $vgpr10
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB13_2
-; GCN-NEXT:  .LBB13_4: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v21
-; GCN-NEXT:    s_mov_b32 s6, 0x30000
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v22
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v23
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v24
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v25
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v10
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v12
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v14
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v16
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v18
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GCN-NEXT:    v_or_b32_e32 v0, v20, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v26, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v27, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v28, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v29, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v13, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v15, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v17, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v19, v9
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x30000, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, s6, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, s6, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, s6, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 0x30000, v7
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x30000, v8
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v9
+; GCN-NEXT:    s_cbranch_execz .LBB11_2
+; GCN-NEXT:  .LBB11_4: ; %cmp.true
+; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v33
+; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v29
+; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v32
+; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v27
+; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v31
+; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v25
+; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v30
+; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v23
+; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v28
+; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v22
+; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
+; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v21
+; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v24
+; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v20
+; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; GCN-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
+; GCN-NEXT:    v_or_b32_e32 v5, v17, v16
+; GCN-NEXT:    v_or_b32_e32 v6, v19, v18
+; GCN-NEXT:    v_or_b32_e32 v7, v12, v15
+; GCN-NEXT:    v_or_b32_e32 v8, v11, v14
+; GCN-NEXT:    v_or_b32_e32 v9, v10, v13
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v20i16_to_v10f32:
+; VI-LABEL: bitcast_v20f16_to_v10f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB13_2
+; VI-NEXT:    s_cbranch_execz .LBB11_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v11, 3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v9
-; VI-NEXT:    v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:    v_add_u16_e32 v10, 3, v8
-; VI-NEXT:    v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v8, v10, v8
-; VI-NEXT:    v_add_u16_e32 v10, 3, v7
-; VI-NEXT:    v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v7, v10, v7
-; VI-NEXT:    v_add_u16_e32 v10, 3, v6
-; VI-NEXT:    v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v6, v10, v6
-; VI-NEXT:    v_add_u16_e32 v10, 3, v5
-; VI-NEXT:    v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v5, v10, v5
-; VI-NEXT:    v_add_u16_e32 v10, 3, v4
-; VI-NEXT:    v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_add_u16_e32 v10, 3, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v3, v10, v3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v2
-; VI-NEXT:    v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v10, v2
-; VI-NEXT:    v_add_u16_e32 v10, 3, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v10, v1
-; VI-NEXT:    v_add_u16_e32 v10, 3, v0
-; VI-NEXT:    v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v10, v0
-; VI-NEXT:  .LBB13_2: ; %end
+; VI-NEXT:    v_mov_b32_e32 v10, 0x200
+; VI-NEXT:    v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT:    v_or_b32_e32 v9, v9, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v8, 0x200, v8
+; VI-NEXT:    v_or_b32_e32 v8, v8, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT:    v_or_b32_e32 v7, v7, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v6, 0x200, v6
+; VI-NEXT:    v_or_b32_e32 v6, v6, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v5, 0x200, v5
+; VI-NEXT:    v_or_b32_e32 v5, v5, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v4, 0x200, v4
+; VI-NEXT:    v_or_b32_e32 v4, v4, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT:    v_or_b32_e32 v3, v3, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT:    v_or_b32_e32 v2, v2, v11
+; VI-NEXT:    v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT:    v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v0, 0x200, v0
+; VI-NEXT:    v_or_b32_e32 v1, v1, v11
+; VI-NEXT:    v_or_b32_e32 v0, v0, v10
+; VI-NEXT:  .LBB11_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v20i16_to_v10f32:
+; GFX9-LABEL: bitcast_v20f16_to_v10f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB13_2
+; GFX9-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB13_2: ; %end
+; GFX9-NEXT:    s_movk_i32 s6, 0x200
+; GFX9-NEXT:    v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT:  .LBB11_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v20i16_to_v10f32:
+; GFX11-LABEL: bitcast_v20f16_to_v10f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
@@ -3673,31 +3883,31 @@ define <10 x float> @bitcast_v20i16_to_v10f32(<20 x i16> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:  .LBB13_2: ; %end
+; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT:  .LBB11_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
 cmp.true:
-  %a1 = add <20 x i16> %a, splat (i16 3)
-  %a2 = bitcast <20 x i16> %a1 to <10 x float>
+  %a1 = fadd <20 x half> %a, splat (half 0xH0200)
+  %a2 = bitcast <20 x half> %a1 to <10 x float>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <20 x i16> %a to <10 x float>
+  %a3 = bitcast <20 x half> %a to <10 x float>
   br label %end
 
 end:
@@ -3705,3797 +3915,420 @@ end:
   ret <10 x float> %phi
 }
 
-define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) {
-; GCN-LABEL: bitcast_v10f32_to_v20f16:
+define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v10f32_to_v40i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v29, v9
-; GCN-NEXT:    v_mov_b32_e32 v28, v8
-; GCN-NEXT:    v_mov_b32_e32 v27, v7
-; GCN-NEXT:    v_mov_b32_e32 v26, v6
-; GCN-NEXT:    v_mov_b32_e32 v25, v5
-; GCN-NEXT:    v_mov_b32_e32 v24, v4
-; GCN-NEXT:    v_mov_b32_e32 v23, v3
-; GCN-NEXT:    v_mov_b32_e32 v22, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v1
-; GCN-NEXT:    v_mov_b32_e32 v20, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr1
-; GCN-NEXT:    ; implicit-def: $vgpr2
-; GCN-NEXT:    ; implicit-def: $vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr4
-; GCN-NEXT:    ; implicit-def: $vgpr5
-; GCN-NEXT:    ; implicit-def: $vgpr6
-; GCN-NEXT:    ; implicit-def: $vgpr7
-; GCN-NEXT:    ; implicit-def: $vgpr8
-; GCN-NEXT:    ; implicit-def: $vgpr9
-; GCN-NEXT:    ; implicit-def: $vgpr10
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr12
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr19
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB14_3
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB14_4
-; GCN-NEXT:  .LBB14_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB14_3: ; %cmp.false
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v29
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v27
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v26
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
-; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v23
-; GCN-NEXT:    v_lshrrev_b32_e32 v31, 16, v22
-; GCN-NEXT:    v_lshrrev_b32_e32 v32, 16, v21
-; GCN-NEXT:    v_lshrrev_b32_e32 v33, 16, v20
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v28
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v27
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v26
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v25
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v24
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v23
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v22
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v21
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v30
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v33
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v20
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB14_2
-; GCN-NEXT:  .LBB14_4: ; %cmp.true
-; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v20
-; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v21
-; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v22
-; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v23
-; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v24
-; GCN-NEXT:    v_add_f32_e32 v11, 1.0, v25
-; GCN-NEXT:    v_add_f32_e32 v13, 1.0, v26
-; GCN-NEXT:    v_add_f32_e32 v15, 1.0, v27
-; GCN-NEXT:    v_add_f32_e32 v17, 1.0, v28
-; GCN-NEXT:    v_add_f32_e32 v19, 1.0, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v19
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v17
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v15
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v10f32_to_v20f16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB14_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; VI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; VI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; VI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; VI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; VI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; VI-NEXT:  .LBB14_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v10f32_to_v20f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB14_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GFX9-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GFX9-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; GFX9-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:  .LBB14_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v10f32_to_v20f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT:    v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT:    v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT:    v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT:    v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
-  %a2 = bitcast <10 x float> %a1 to <20 x half>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <10 x float> %a to <20 x half>
-  br label %end
-
-end:
-  %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x half> %phi
-}
-
-define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20f16_to_v10f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v31, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v25, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v30, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v23, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v26, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v24, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v19
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v18
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB15_3
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB15_4
-; GCN-NEXT:  .LBB15_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB15_3: ; %cmp.false
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v33
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v30
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v28
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v26
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v24
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v14
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v13
-; GCN-NEXT:    v_or_b32_e32 v0, v29, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v27, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v25, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v23, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v22, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v21, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v20, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v12, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v11, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v10, v9
-; GCN-NEXT:    ; implicit-def: $vgpr33
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr12
-; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr10
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB15_2
-; GCN-NEXT:  .LBB15_4: ; %cmp.true
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v33
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v27
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v25
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v30
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v23
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v28
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v22
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v26
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v21
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v24
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v20
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
-; GCN-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; GCN-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; GCN-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; GCN-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; GCN-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; GCN-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; GCN-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; GCN-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; GCN-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; GCN-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; GCN-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; GCN-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; GCN-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; GCN-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; GCN-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; GCN-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; GCN-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; GCN-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; GCN-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
-; GCN-NEXT:    v_or_b32_e32 v5, v17, v16
-; GCN-NEXT:    v_or_b32_e32 v6, v19, v18
-; GCN-NEXT:    v_or_b32_e32 v7, v12, v15
-; GCN-NEXT:    v_or_b32_e32 v8, v11, v14
-; GCN-NEXT:    v_or_b32_e32 v9, v10, v13
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v20f16_to_v10f32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB15_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v10, 0x200
-; VI-NEXT:    v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v9, 0x200, v9
-; VI-NEXT:    v_or_b32_e32 v9, v9, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v8, 0x200, v8
-; VI-NEXT:    v_or_b32_e32 v8, v8, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v7, 0x200, v7
-; VI-NEXT:    v_or_b32_e32 v7, v7, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v6, 0x200, v6
-; VI-NEXT:    v_or_b32_e32 v6, v6, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v5, 0x200, v5
-; VI-NEXT:    v_or_b32_e32 v5, v5, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v4, 0x200, v4
-; VI-NEXT:    v_or_b32_e32 v4, v4, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v3, 0x200, v3
-; VI-NEXT:    v_or_b32_e32 v3, v3, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT:    v_or_b32_e32 v2, v2, v11
-; VI-NEXT:    v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT:    v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v0, 0x200, v0
-; VI-NEXT:    v_or_b32_e32 v1, v1, v11
-; VI-NEXT:    v_or_b32_e32 v0, v0, v10
-; VI-NEXT:  .LBB15_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v20f16_to_v10f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB15_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    s_movk_i32 s6, 0x200
-; GFX9-NEXT:    v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB15_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v20f16_to_v10f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB15_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:  .LBB15_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = fadd <20 x half> %a, splat (half 0xH0200)
-  %a2 = bitcast <20 x half> %a1 to <10 x float>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <20 x half> %a to <10 x float>
-  br label %end
-
-end:
-  %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <10 x float> %phi
-}
-
-define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
-; GCN-LABEL: bitcast_v10f32_to_v40i8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GCN-NEXT:    ; implicit-def: $vgpr35
-; GCN-NEXT:    ; implicit-def: $vgpr33
-; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr48
-; GCN-NEXT:    ; implicit-def: $vgpr39
-; GCN-NEXT:    ; implicit-def: $vgpr38
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr37
-; GCN-NEXT:    ; implicit-def: $vgpr36
-; GCN-NEXT:    ; implicit-def: $vgpr34
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr12
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr19
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB16_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_alignbit_b32 v11, v10, v9, 24
-; GCN-NEXT:    v_alignbit_b32 v12, v10, v9, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v10, v9, 8
-; GCN-NEXT:    v_alignbit_b32 v14, v8, v7, 24
-; GCN-NEXT:    v_alignbit_b32 v15, v8, v7, 16
-; GCN-NEXT:    v_alignbit_b32 v17, v8, v7, 8
-; GCN-NEXT:    v_alignbit_b32 v20, v6, v5, 24
-; GCN-NEXT:    v_alignbit_b32 v21, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v23, v6, v5, 8
-; GCN-NEXT:    v_alignbit_b32 v26, v4, v3, 24
-; GCN-NEXT:    v_alignbit_b32 v27, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v29, v4, v3, 8
-; GCN-NEXT:    v_alignbit_b32 v32, v2, v1, 24
-; GCN-NEXT:    v_alignbit_b32 v33, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v35, v2, v1, 8
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 24, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v25, 8, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v28, 24, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v34, 24, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v37, 8, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB16_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB16_4
-; GCN-NEXT:  ; %bb.3: ; %cmp.true
-; GCN-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_alignbit_b32 v11, v10, v9, 24
-; GCN-NEXT:    v_alignbit_b32 v12, v10, v9, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v10, v9, 8
-; GCN-NEXT:    v_alignbit_b32 v14, v8, v7, 24
-; GCN-NEXT:    v_alignbit_b32 v15, v8, v7, 16
-; GCN-NEXT:    v_alignbit_b32 v17, v8, v7, 8
-; GCN-NEXT:    v_alignbit_b32 v20, v6, v5, 24
-; GCN-NEXT:    v_alignbit_b32 v21, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v23, v6, v5, 8
-; GCN-NEXT:    v_alignbit_b32 v26, v4, v3, 24
-; GCN-NEXT:    v_alignbit_b32 v27, v4, v3, 16
-; GCN-NEXT:    v_alignbit_b32 v29, v4, v3, 8
-; GCN-NEXT:    v_alignbit_b32 v32, v2, v1, 24
-; GCN-NEXT:    v_alignbit_b32 v33, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v35, v2, v1, 8
-; GCN-NEXT:    v_lshrrev_b32_e32 v16, 24, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v25, 8, v8
-; GCN-NEXT:    v_lshrrev_b32_e32 v28, 24, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v34, 24, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v37, 8, v4
-; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
-; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB16_4: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v49, 0xff, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v35, 8, v35
-; GCN-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GCN-NEXT:    v_lshlrev_b32_e32 v32, 24, v32
-; GCN-NEXT:    v_and_b32_e32 v50, 0xff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v48, 8, v48
-; GCN-NEXT:    v_and_b32_e32 v39, 0xff, v39
-; GCN-NEXT:    v_lshlrev_b32_e32 v38, 24, v38
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; GCN-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GCN-NEXT:    v_lshlrev_b32_e32 v26, 24, v26
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v37, 8, v37
-; GCN-NEXT:    v_and_b32_e32 v36, 0xff, v36
-; GCN-NEXT:    v_lshlrev_b32_e32 v34, 24, v34
-; GCN-NEXT:    v_add_i32_e32 v51, vcc, 12, v0
-; GCN-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; GCN-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 24, v20
-; GCN-NEXT:    v_or_b32_e32 v35, v49, v35
-; GCN-NEXT:    v_add_i32_e32 v49, vcc, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v31, 8, v31
-; GCN-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GCN-NEXT:    v_lshlrev_b32_e32 v28, 24, v28
-; GCN-NEXT:    v_or_b32_e32 v48, v50, v48
-; GCN-NEXT:    v_add_i32_e32 v50, vcc, 20, v0
-; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; GCN-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v29
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, 24, v0
-; GCN-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
-; GCN-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GCN-NEXT:    v_lshlrev_b32_e32 v22, 24, v22
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v37
-; GCN-NEXT:    v_add_i32_e32 v37, vcc, 28, v0
-; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GCN-NEXT:    v_or_b32_e32 v5, v5, v23
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, 32, v0
-; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
-; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v31
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, 36, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
-; GCN-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; GCN-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; GCN-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v17
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v25
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v24
-; GCN-NEXT:    v_or_b32_e32 v9, v9, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v19
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v35
-; GCN-NEXT:    v_or_b32_e32 v19, v32, v33
-; GCN-NEXT:    v_and_b32_e32 v24, 0xffff, v48
-; GCN-NEXT:    v_or_b32_e32 v25, v38, v39
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT:    v_or_b32_e32 v26, v26, v27
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_or_b32_e32 v27, v34, v36
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT:    v_or_b32_e32 v20, v20, v21
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_or_b32_e32 v21, v28, v30
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v15
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_or_b32_e32 v15, v22, v17
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_or_b32_e32 v12, v16, v13
-; GCN-NEXT:    v_or_b32_e32 v13, v18, v19
-; GCN-NEXT:    v_or_b32_e32 v16, v24, v25
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v26
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v27
-; GCN-NEXT:    v_or_b32_e32 v5, v5, v20
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v21
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v14
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v15
-; GCN-NEXT:    v_or_b32_e32 v9, v9, v11
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v12
-; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v4, v51, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v5, v49, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v6, v50, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v7, v29, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v8, v37, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v9, v23, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v10, v31, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v10f32_to_v40i8:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; VI-NEXT:    ; implicit-def: $vgpr16
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr15
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr14
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr30
-; VI-NEXT:    ; implicit-def: $vgpr13
-; VI-NEXT:    ; implicit-def: $vgpr29
-; VI-NEXT:    ; implicit-def: $vgpr28
-; VI-NEXT:    ; implicit-def: $vgpr27
-; VI-NEXT:    ; implicit-def: $vgpr26
-; VI-NEXT:    ; implicit-def: $vgpr25
-; VI-NEXT:    ; implicit-def: $vgpr12
-; VI-NEXT:    ; implicit-def: $vgpr24
-; VI-NEXT:    ; implicit-def: $vgpr23
-; VI-NEXT:    ; implicit-def: $vgpr22
-; VI-NEXT:    ; implicit-def: $vgpr21
-; VI-NEXT:    ; implicit-def: $vgpr20
-; VI-NEXT:    ; implicit-def: $vgpr19
-; VI-NEXT:    ; implicit-def: $vgpr18
-; VI-NEXT:    ; implicit-def: $vgpr17
-; VI-NEXT:    ; implicit-def: $vgpr11
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB16_2
-; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; VI-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB16_2: ; %Flow
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB16_4
-; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; VI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; VI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; VI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; VI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; VI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; VI-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB16_4: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
-; VI-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; VI-NEXT:    v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v39
-; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v37
-; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v14
-; VI-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v34
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v32
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v13
-; VI-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v29
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v27
-; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v26
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
-; VI-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v24
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v22
-; VI-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v21
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
-; VI-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
-; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v19
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v17
-; VI-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
-; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v10f32_to_v40i8:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    ; implicit-def: $vgpr48
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr39
-; GFX9-NEXT:    ; implicit-def: $vgpr38
-; GFX9-NEXT:    ; implicit-def: $vgpr37
-; GFX9-NEXT:    ; implicit-def: $vgpr36
-; GFX9-NEXT:    ; implicit-def: $vgpr35
-; GFX9-NEXT:    ; implicit-def: $vgpr14
-; GFX9-NEXT:    ; implicit-def: $vgpr34
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr32
-; GFX9-NEXT:    ; implicit-def: $vgpr31
-; GFX9-NEXT:    ; implicit-def: $vgpr30
-; GFX9-NEXT:    ; implicit-def: $vgpr13
-; GFX9-NEXT:    ; implicit-def: $vgpr29
-; GFX9-NEXT:    ; implicit-def: $vgpr28
-; GFX9-NEXT:    ; implicit-def: $vgpr27
-; GFX9-NEXT:    ; implicit-def: $vgpr26
-; GFX9-NEXT:    ; implicit-def: $vgpr25
-; GFX9-NEXT:    ; implicit-def: $vgpr12
-; GFX9-NEXT:    ; implicit-def: $vgpr24
-; GFX9-NEXT:    ; implicit-def: $vgpr23
-; GFX9-NEXT:    ; implicit-def: $vgpr22
-; GFX9-NEXT:    ; implicit-def: $vgpr21
-; GFX9-NEXT:    ; implicit-def: $vgpr20
-; GFX9-NEXT:    ; implicit-def: $vgpr19
-; GFX9-NEXT:    ; implicit-def: $vgpr18
-; GFX9-NEXT:    ; implicit-def: $vgpr17
-; GFX9-NEXT:    ; implicit-def: $vgpr11
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB16_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.false
-; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX9-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX9-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB16_2: ; %Flow
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB16_4
-; GFX9-NEXT:  ; %bb.3: ; %cmp.true
-; GFX9-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GFX9-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX9-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GFX9-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX9-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; GFX9-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX9-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB16_4: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
-; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
-; GFX9-NEXT:    v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v39
-; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v37
-; GFX9-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v14
-; GFX9-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v34
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v32
-; GFX9-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v13
-; GFX9-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v29
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v27
-; GFX9-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v26
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
-; GFX9-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v24
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v22
-; GFX9-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v21
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
-; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v19
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v17
-; GFX9-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v10f32_to_v40i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB16_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB16_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB16_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9
-; GFX11-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1
-; GFX11-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7
-; GFX11-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3
-; GFX11-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB16_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v16
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xff, v48
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v31, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v39
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v16
-; GFX11-NEXT:    v_or_b32_e32 v15, v48, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v14, v35, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v31
-; GFX11-NEXT:    v_or_b32_e32 v13, v30, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_or_b32_e32 v16, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v29
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v27
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v26
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v32
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v34
-; GFX11-NEXT:    v_or_b32_e32 v32, v33, v32
-; GFX11-NEXT:    v_or_b32_e32 v12, v25, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v16
-; GFX11-NEXT:    v_or_b32_e32 v11, v20, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v30
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v15
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
-  %a2 = bitcast <10 x float> %a1 to <40 x i8>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <10 x float> %a to <40 x i8>
-  br label %end
-
-end:
-  %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <40 x i8> %phi
-}
-
-define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
-; GCN-LABEL: bitcast_v40i8_to_v10f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    v_mov_b32_e32 v35, v8
-; GCN-NEXT:    v_mov_b32_e32 v34, v6
-; GCN-NEXT:    v_mov_b32_e32 v33, v4
-; GCN-NEXT:    v_mov_b32_e32 v32, v2
-; GCN-NEXT:    v_mov_b32_e32 v31, v0
-; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
-; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:28
-; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
-; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:20
-; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
-; GCN-NEXT:    s_waitcnt expcnt(1)
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12
-; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:36
-; GCN-NEXT:    v_lshlrev_b32_e32 v38, 8, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v36, 24, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v39, 8, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v37, 24, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v48, 8, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v49, 8, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 24, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v50, 8, v17
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 24, v19
-; GCN-NEXT:    v_lshlrev_b32_e32 v51, 8, v21
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 24, v23
-; GCN-NEXT:    v_lshlrev_b32_e32 v52, 8, v25
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 24, v27
-; GCN-NEXT:    v_lshlrev_b32_e32 v27, 8, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v53
-; GCN-NEXT:    v_lshlrev_b32_e32 v21, 24, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v23, 24, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v53, 8, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v25, 24, v0
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB17_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GCN-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GCN-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v12
-; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v14
-; GCN-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v20
-; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v22
-; GCN-NEXT:    v_and_b32_e32 v14, 0xff, v24
-; GCN-NEXT:    v_and_b32_e32 v16, 0xff, v26
-; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v28
-; GCN-NEXT:    v_and_b32_e32 v20, 0xff, v30
-; GCN-NEXT:    v_and_b32_e32 v22, 0xff, v41
-; GCN-NEXT:    v_and_b32_e32 v24, 0xff, v40
-; GCN-NEXT:    v_and_b32_e32 v26, 0xff, v55
-; GCN-NEXT:    v_and_b32_e32 v28, 0xff, v54
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v38
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v39
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v48
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v49
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v50
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v51
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v52
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v27
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GCN-NEXT:    v_or_b32_e32 v22, v22, v29
-; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GCN-NEXT:    v_or_b32_e32 v26, v26, v53
-; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v36, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v37, v3
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v13, v7
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v15, v9
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_or_b32_e32 v11, v17, v12
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; GCN-NEXT:    v_or_b32_e32 v13, v19, v16
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v18
-; GCN-NEXT:    v_or_b32_e32 v15, v21, v20
-; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v22
-; GCN-NEXT:    v_or_b32_e32 v17, v23, v24
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v26
-; GCN-NEXT:    v_or_b32_e32 v19, v25, v27
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_or_b32_e32 v1, v2, v3
-; GCN-NEXT:    v_or_b32_e32 v2, v4, v5
-; GCN-NEXT:    v_or_b32_e32 v3, v6, v7
-; GCN-NEXT:    v_or_b32_e32 v4, v8, v9
-; GCN-NEXT:    v_or_b32_e32 v5, v10, v11
-; GCN-NEXT:    v_or_b32_e32 v6, v12, v13
-; GCN-NEXT:    v_or_b32_e32 v7, v14, v15
-; GCN-NEXT:    v_or_b32_e32 v8, v16, v17
-; GCN-NEXT:    v_or_b32_e32 v9, v18, v19
-; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr33
-; GCN-NEXT:    ; implicit-def: $vgpr34
-; GCN-NEXT:    ; implicit-def: $vgpr35
-; GCN-NEXT:    ; implicit-def: $vgpr10
-; GCN-NEXT:    ; implicit-def: $vgpr12
-; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr41
-; GCN-NEXT:    ; implicit-def: $vgpr40
-; GCN-NEXT:    ; implicit-def: $vgpr55
-; GCN-NEXT:    ; implicit-def: $vgpr54
-; GCN-NEXT:    ; implicit-def: $vgpr38
-; GCN-NEXT:    ; implicit-def: $vgpr36
-; GCN-NEXT:    ; implicit-def: $vgpr39
-; GCN-NEXT:    ; implicit-def: $vgpr37
-; GCN-NEXT:    ; implicit-def: $vgpr48
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr49
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr50
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr51
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr52
-; GCN-NEXT:    ; implicit-def: $vgpr19
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr53
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:  .LBB17_2: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB17_4
-; GCN-NEXT:  ; %bb.3: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v31
-; GCN-NEXT:    s_movk_i32 s6, 0x300
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v32
-; GCN-NEXT:    s_mov_b32 s7, 0x3000000
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v33
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v34
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v35
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v10
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v12
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v14
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v16
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v18
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 3, v20
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 3, v22
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 3, v24
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, 3, v26
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 3, v28
-; GCN-NEXT:    v_add_i32_e32 v20, vcc, 3, v30
-; GCN-NEXT:    v_add_i32_e32 v22, vcc, 3, v41
-; GCN-NEXT:    v_add_i32_e32 v24, vcc, 3, v40
-; GCN-NEXT:    v_add_i32_e32 v26, vcc, 3, v55
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, 3, v54
-; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GCN-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GCN-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GCN-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GCN-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GCN-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GCN-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GCN-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GCN-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GCN-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GCN-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GCN-NEXT:    v_or_b32_e32 v0, v38, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v39, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v48, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v49, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v50, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_or_b32_e32 v10, v51, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_or_b32_e32 v14, v52, v14
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GCN-NEXT:    v_or_b32_e32 v18, v27, v18
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GCN-NEXT:    v_or_b32_e32 v22, v29, v22
-; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GCN-NEXT:    v_or_b32_e32 v26, v53, v26
-; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v36, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v37, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, s6, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, s6, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v13, v7
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, s6, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v15, v9
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, s6, v10
-; GCN-NEXT:    v_or_b32_e32 v11, v17, v12
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, s6, v14
-; GCN-NEXT:    v_or_b32_e32 v13, v19, v16
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, s6, v18
-; GCN-NEXT:    v_or_b32_e32 v15, v21, v20
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, s6, v22
-; GCN-NEXT:    v_or_b32_e32 v17, v23, v24
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 0x300, v26
-; GCN-NEXT:    v_or_b32_e32 v19, v25, v27
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
-; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
-; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
-; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
-; GCN-NEXT:    v_or_b32_e32 v8, v17, v16
-; GCN-NEXT:    v_or_b32_e32 v9, v19, v18
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s7, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s7, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, s7, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, s7, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x3000000, v8
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v9
-; GCN-NEXT:  .LBB17_4: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v40i8_to_v10f32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v35, v8
-; VI-NEXT:    v_mov_b32_e32 v34, v6
-; VI-NEXT:    v_mov_b32_e32 v33, v4
-; VI-NEXT:    v_mov_b32_e32 v32, v2
-; VI-NEXT:    v_mov_b32_e32 v31, v0
-; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:8
-; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32
-; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36
-; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
-; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v1
-; VI-NEXT:    v_lshlrev_b16_e32 v42, 8, v3
-; VI-NEXT:    v_lshlrev_b16_e32 v41, 8, v5
-; VI-NEXT:    v_lshlrev_b16_e32 v40, 8, v7
-; VI-NEXT:    v_lshlrev_b16_e32 v55, 8, v9
-; VI-NEXT:    v_lshlrev_b16_e32 v54, 8, v11
-; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v13
-; VI-NEXT:    v_lshlrev_b16_e32 v52, 8, v15
-; VI-NEXT:    v_lshlrev_b16_e32 v51, 8, v17
-; VI-NEXT:    v_lshlrev_b16_e32 v50, 8, v19
-; VI-NEXT:    v_lshlrev_b16_e32 v49, 8, v21
-; VI-NEXT:    v_lshlrev_b16_e32 v48, 8, v23
-; VI-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
-; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
-; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v0
-; VI-NEXT:    s_waitcnt vmcnt(8)
-; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v2
-; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v4
-; VI-NEXT:    s_waitcnt vmcnt(6)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b16_e32 v11, 8, v8
-; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v13, 8, v44
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB17_2
-; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr10
-; VI-NEXT:    ; implicit-def: $vgpr12
-; VI-NEXT:    ; implicit-def: $vgpr14
-; VI-NEXT:    ; implicit-def: $vgpr16
-; VI-NEXT:    ; implicit-def: $vgpr18
-; VI-NEXT:    ; implicit-def: $vgpr20
-; VI-NEXT:    ; implicit-def: $vgpr22
-; VI-NEXT:    ; implicit-def: $vgpr24
-; VI-NEXT:    ; implicit-def: $vgpr26
-; VI-NEXT:    ; implicit-def: $vgpr28
-; VI-NEXT:    ; implicit-def: $vgpr30
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr41
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr55
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr25
-; VI-NEXT:    ; implicit-def: $vgpr23
-; VI-NEXT:    ; implicit-def: $vgpr21
-; VI-NEXT:    ; implicit-def: $vgpr19
-; VI-NEXT:    ; implicit-def: $vgpr17
-; VI-NEXT:    ; implicit-def: $vgpr15
-; VI-NEXT:    ; implicit-def: $vgpr13
-; VI-NEXT:    ; implicit-def: $vgpr11
-; VI-NEXT:  .LBB17_2: ; %Flow
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB17_4
-; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_add_u16_e32 v0, 3, v31
-; VI-NEXT:    v_add_u16_e32 v1, 3, v32
-; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v9, 0x300
-; VI-NEXT:    v_add_u16_e32 v0, 0x300, v0
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_add_u16_e32 v1, 3, v33
-; VI-NEXT:    v_add_u16_e32 v2, 3, v34
-; VI-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v1, 0x300, v1
-; VI-NEXT:    v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_add_u16_e32 v2, 3, v35
-; VI-NEXT:    v_add_u16_e32 v3, 3, v10
-; VI-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v2, 0x300, v2
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-NEXT:    v_add_u16_e32 v3, 3, v12
-; VI-NEXT:    v_add_u16_e32 v4, 3, v14
-; VI-NEXT:    v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v3, 0x300, v3
-; VI-NEXT:    v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v3, v3, v4
-; VI-NEXT:    v_add_u16_e32 v4, 3, v16
-; VI-NEXT:    v_add_u16_e32 v5, 3, v18
-; VI-NEXT:    v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v4, 0x300, v4
-; VI-NEXT:    v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v4, v5
-; VI-NEXT:    v_add_u16_e32 v5, 3, v20
-; VI-NEXT:    v_add_u16_e32 v6, 3, v22
-; VI-NEXT:    v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v5, 0x300, v5
-; VI-NEXT:    v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v5, v5, v6
-; VI-NEXT:    v_add_u16_e32 v6, 3, v24
-; VI-NEXT:    v_add_u16_e32 v7, 3, v26
-; VI-NEXT:    v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v6, 0x300, v6
-; VI-NEXT:    v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v6, v6, v7
-; VI-NEXT:    v_add_u16_e32 v7, 3, v28
-; VI-NEXT:    v_add_u16_e32 v8, 3, v30
-; VI-NEXT:    v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v7, 0x300, v7
-; VI-NEXT:    v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v7, v7, v8
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v8, 3, v39
-; VI-NEXT:    v_add_u16_e32 v10, 3, v38
-; VI-NEXT:    v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v8, 0x300, v8
-; VI-NEXT:    v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v8, v8, v10
-; VI-NEXT:    v_add_u16_e32 v10, 3, v37
-; VI-NEXT:    v_add_u16_e32 v12, 3, v36
-; VI-NEXT:    v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v10, 0x300, v10
-; VI-NEXT:    v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:  .LBB17_4: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v40i8_to_v10f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_mov_b32_e32 v35, v8
-; GFX9-NEXT:    v_mov_b32_e32 v34, v6
-; GFX9-NEXT:    v_mov_b32_e32 v33, v4
-; GFX9-NEXT:    v_mov_b32_e32 v32, v2
-; GFX9-NEXT:    v_mov_b32_e32 v31, v0
-; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:8
-; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_lshlrev_b16_e32 v43, 8, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v42, 8, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v41, 8, v5
-; GFX9-NEXT:    v_lshlrev_b16_e32 v40, 8, v7
-; GFX9-NEXT:    v_lshlrev_b16_e32 v55, 8, v9
-; GFX9-NEXT:    v_lshlrev_b16_e32 v54, 8, v11
-; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v13
-; GFX9-NEXT:    v_lshlrev_b16_e32 v52, 8, v15
-; GFX9-NEXT:    v_lshlrev_b16_e32 v51, 8, v17
-; GFX9-NEXT:    v_lshlrev_b16_e32 v50, 8, v19
-; GFX9-NEXT:    v_lshlrev_b16_e32 v49, 8, v21
-; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v23
-; GFX9-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
-; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
-; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v2
-; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v4
-; GFX9-NEXT:    s_waitcnt vmcnt(6)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 8, v8
-; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 8, v44
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB17_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.false
-; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    ; implicit-def: $vgpr31
-; GFX9-NEXT:    ; implicit-def: $vgpr32
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr34
-; GFX9-NEXT:    ; implicit-def: $vgpr35
-; GFX9-NEXT:    ; implicit-def: $vgpr10
-; GFX9-NEXT:    ; implicit-def: $vgpr12
-; GFX9-NEXT:    ; implicit-def: $vgpr14
-; GFX9-NEXT:    ; implicit-def: $vgpr16
-; GFX9-NEXT:    ; implicit-def: $vgpr18
-; GFX9-NEXT:    ; implicit-def: $vgpr20
-; GFX9-NEXT:    ; implicit-def: $vgpr22
-; GFX9-NEXT:    ; implicit-def: $vgpr24
-; GFX9-NEXT:    ; implicit-def: $vgpr26
-; GFX9-NEXT:    ; implicit-def: $vgpr28
-; GFX9-NEXT:    ; implicit-def: $vgpr30
-; GFX9-NEXT:    ; implicit-def: $vgpr39
-; GFX9-NEXT:    ; implicit-def: $vgpr38
-; GFX9-NEXT:    ; implicit-def: $vgpr37
-; GFX9-NEXT:    ; implicit-def: $vgpr36
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    ; implicit-def: $vgpr42
-; GFX9-NEXT:    ; implicit-def: $vgpr41
-; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr55
-; GFX9-NEXT:    ; implicit-def: $vgpr54
-; GFX9-NEXT:    ; implicit-def: $vgpr53
-; GFX9-NEXT:    ; implicit-def: $vgpr52
-; GFX9-NEXT:    ; implicit-def: $vgpr51
-; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr49
-; GFX9-NEXT:    ; implicit-def: $vgpr48
-; GFX9-NEXT:    ; implicit-def: $vgpr25
-; GFX9-NEXT:    ; implicit-def: $vgpr23
-; GFX9-NEXT:    ; implicit-def: $vgpr21
-; GFX9-NEXT:    ; implicit-def: $vgpr19
-; GFX9-NEXT:    ; implicit-def: $vgpr17
-; GFX9-NEXT:    ; implicit-def: $vgpr15
-; GFX9-NEXT:    ; implicit-def: $vgpr13
-; GFX9-NEXT:    ; implicit-def: $vgpr11
-; GFX9-NEXT:  .LBB17_2: ; %Flow
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB17_4
-; GFX9-NEXT:  ; %bb.3: ; %cmp.true
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v31
-; GFX9-NEXT:    v_add_u16_e32 v1, 3, v32
-; GFX9-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    s_movk_i32 s6, 0x300
-; GFX9-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v0, 0x300, v0
-; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_add_u16_e32 v1, 3, v33
-; GFX9-NEXT:    v_add_u16_e32 v2, 3, v34
-; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v1, 0x300, v1
-; GFX9-NEXT:    v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX9-NEXT:    v_add_u16_e32 v2, 3, v35
-; GFX9-NEXT:    v_add_u16_e32 v3, 3, v10
-; GFX9-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v2, 0x300, v2
-; GFX9-NEXT:    v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX9-NEXT:    v_add_u16_e32 v3, 3, v12
-; GFX9-NEXT:    v_add_u16_e32 v4, 3, v14
-; GFX9-NEXT:    v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v3, 0x300, v3
-; GFX9-NEXT:    v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_add_u16_e32 v4, 3, v16
-; GFX9-NEXT:    v_add_u16_e32 v5, 3, v18
-; GFX9-NEXT:    v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v4, 0x300, v4
-; GFX9-NEXT:    v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX9-NEXT:    v_add_u16_e32 v5, 3, v20
-; GFX9-NEXT:    v_add_u16_e32 v6, 3, v22
-; GFX9-NEXT:    v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v5, 0x300, v5
-; GFX9-NEXT:    v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX9-NEXT:    v_add_u16_e32 v6, 3, v24
-; GFX9-NEXT:    v_add_u16_e32 v7, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v6, 0x300, v6
-; GFX9-NEXT:    v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX9-NEXT:    v_add_u16_e32 v7, 3, v28
-; GFX9-NEXT:    v_add_u16_e32 v8, 3, v30
-; GFX9-NEXT:    v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v7, 0x300, v7
-; GFX9-NEXT:    v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v7, v7, v8
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v8, 3, v39
-; GFX9-NEXT:    v_add_u16_e32 v9, 3, v38
-; GFX9-NEXT:    v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v8, 0x300, v8
-; GFX9-NEXT:    v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX9-NEXT:    v_add_u16_e32 v9, 3, v37
-; GFX9-NEXT:    v_add_u16_e32 v10, 3, v36
-; GFX9-NEXT:    v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v9, 0x300, v9
-; GFX9-NEXT:    v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX9-NEXT:  .LBB17_4: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v40i8_to_v10f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6
-; GFX11-NEXT:    v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2
-; GFX11-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-NEXT:    s_clause 0x9
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v37, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_u16 v38, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v66
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB17_4
-; GFX11-NEXT:  .LBB17_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB17_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v53
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v54
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v55
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v64
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v65
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v48
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v49
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v50
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v51
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v52
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v39
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v36
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v23
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v25
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v27
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v29
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v12, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v13, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB17_2
-; GFX11-NEXT:  .LBB17_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v10, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v12, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v18, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v53, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v54, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v55, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v64, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v65, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v48, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v49, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v50, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v51, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v52, v9
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_add_nc_u16 v5, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v12, v39, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v36, 3
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v5, v21, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v23, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v25, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v27, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v29, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v13, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v15, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v17, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v19, v18
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = add <40 x i8> %a, splat (i8 3)
-  %a2 = bitcast <40 x i8> %a1 to <10 x float>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <40 x i8> %a to <10 x float>
-  br label %end
-
-end:
-  %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <10 x float> %phi
-}
-
-define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) {
-; GCN-LABEL: bitcast_v10f32_to_v5f64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB18_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.true
-; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GCN-NEXT:  .LBB18_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v10f32_to_v5f64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB18_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; VI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; VI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; VI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; VI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; VI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; VI-NEXT:  .LBB18_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v10f32_to_v5f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB18_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GFX9-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GFX9-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; GFX9-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:  .LBB18_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v10f32_to_v5f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT:    v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT:    v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT:    v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT:    v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
-  %a2 = bitcast <10 x float> %a1 to <5 x double>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <10 x float> %a to <5 x double>
-  br label %end
-
-end:
-  %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <5 x double> %phi
-}
-
-define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) {
-; GCN-LABEL: bitcast_v5f64_to_v10f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB19_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.true
-; GCN-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; GCN-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
-; GCN-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
-; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GCN-NEXT:  .LBB19_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v5f64_to_v10f32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB19_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; VI-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
-; VI-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
-; VI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT:  .LBB19_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v5f64_to_v10f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB19_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; GFX9-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
-; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
-; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:  .LBB19_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v5f64_to_v10f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB19_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT:  .LBB19_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = fadd <5 x double> %a, splat (double 1.000000e+00)
-  %a2 = bitcast <5 x double> %a1 to <10 x float>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <5 x double> %a to <10 x float>
-  br label %end
-
-end:
-  %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <10 x float> %phi
-}
-
-define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) {
-; GCN-LABEL: bitcast_v10f32_to_v5i64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB20_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.true
-; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GCN-NEXT:  .LBB20_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v10f32_to_v5i64:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB20_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; VI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; VI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; VI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; VI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; VI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; VI-NEXT:  .LBB20_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v10f32_to_v5i64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB20_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GFX9-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; GFX9-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; GFX9-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; GFX9-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX9-NEXT:  .LBB20_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v10f32_to_v5i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT:    v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
-; GFX11-NEXT:    v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
-; GFX11-NEXT:    v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
-; GFX11-NEXT:    v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
-  %a2 = bitcast <10 x float> %a1 to <5 x i64>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <10 x float> %a to <5 x i64>
-  br label %end
-
-end:
-  %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <5 x i64> %phi
-}
-
-define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) {
-; GCN-LABEL: bitcast_v5i64_to_v10f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB21_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:  .LBB21_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v5i64_to_v10f32:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB21_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
-; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v6
-; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v4
-; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:  .LBB21_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v5i64_to_v10f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB21_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 3, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, 3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 3, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:  .LBB21_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v5i64_to_v10f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB21_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT:  .LBB21_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = add <5 x i64> %a, splat (i64 3)
-  %a2 = bitcast <5 x i64> %a1 to <10 x float>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <5 x i64> %a to <10 x float>
-  br label %end
-
-end:
-  %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <10 x float> %phi
-}
-
-define <20 x half> @bitcast_v20i16_to_v20f16(<20 x i16> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20i16_to_v20f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v39, v19
-; GCN-NEXT:    v_mov_b32_e32 v38, v18
-; GCN-NEXT:    v_mov_b32_e32 v37, v17
-; GCN-NEXT:    v_mov_b32_e32 v36, v16
-; GCN-NEXT:    v_mov_b32_e32 v35, v15
-; GCN-NEXT:    v_mov_b32_e32 v34, v14
-; GCN-NEXT:    v_mov_b32_e32 v33, v13
-; GCN-NEXT:    v_mov_b32_e32 v32, v12
-; GCN-NEXT:    v_mov_b32_e32 v31, v11
-; GCN-NEXT:    v_mov_b32_e32 v30, v10
-; GCN-NEXT:    v_mov_b32_e32 v29, v9
-; GCN-NEXT:    v_mov_b32_e32 v28, v8
-; GCN-NEXT:    v_mov_b32_e32 v27, v7
-; GCN-NEXT:    v_mov_b32_e32 v26, v6
-; GCN-NEXT:    v_mov_b32_e32 v25, v5
-; GCN-NEXT:    v_mov_b32_e32 v24, v4
-; GCN-NEXT:    v_mov_b32_e32 v23, v3
-; GCN-NEXT:    v_mov_b32_e32 v22, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v1
-; GCN-NEXT:    v_mov_b32_e32 v48, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr1
-; GCN-NEXT:    ; implicit-def: $vgpr2
-; GCN-NEXT:    ; implicit-def: $vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr4
-; GCN-NEXT:    ; implicit-def: $vgpr5
-; GCN-NEXT:    ; implicit-def: $vgpr6
-; GCN-NEXT:    ; implicit-def: $vgpr7
-; GCN-NEXT:    ; implicit-def: $vgpr8
-; GCN-NEXT:    ; implicit-def: $vgpr9
-; GCN-NEXT:    ; implicit-def: $vgpr10
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr12
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr19
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB22_3
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB22_4
-; GCN-NEXT:  .LBB22_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB22_3: ; %cmp.false
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v48
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v21
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v22
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v23
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v24
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v25
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v26
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v27
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v28
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v29
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v30
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v31
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v32
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v33
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v34
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v35
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v36
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v37
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v38
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v39
-; GCN-NEXT:    ; implicit-def: $vgpr48
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr33
-; GCN-NEXT:    ; implicit-def: $vgpr34
-; GCN-NEXT:    ; implicit-def: $vgpr35
-; GCN-NEXT:    ; implicit-def: $vgpr36
-; GCN-NEXT:    ; implicit-def: $vgpr37
-; GCN-NEXT:    ; implicit-def: $vgpr38
-; GCN-NEXT:    ; implicit-def: $vgpr39
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB22_2
-; GCN-NEXT:  .LBB22_4: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 3, v39
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 3, v38
-; GCN-NEXT:    v_add_i32_e32 v17, vcc, 3, v37
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, 3, v36
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, 3, v35
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 3, v34
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, 3, v33
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 3, v32
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 3, v31
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 3, v30
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v29
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v28
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v27
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v26
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v25
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v24
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v23
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v22
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v21
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v48
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v20i16_to_v20f16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB22_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v10, 3
-; VI-NEXT:    v_add_u16_sdwa v11, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v12, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v13, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v14, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v15, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v16, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v17, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v18, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v19, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v10, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v9, 3, v9
-; VI-NEXT:    v_add_u16_e32 v8, 3, v8
-; VI-NEXT:    v_add_u16_e32 v7, 3, v7
-; VI-NEXT:    v_add_u16_e32 v6, 3, v6
-; VI-NEXT:    v_add_u16_e32 v5, 3, v5
-; VI-NEXT:    v_add_u16_e32 v4, 3, v4
-; VI-NEXT:    v_add_u16_e32 v3, 3, v3
-; VI-NEXT:    v_add_u16_e32 v2, 3, v2
-; VI-NEXT:    v_add_u16_e32 v1, 3, v1
-; VI-NEXT:    v_add_u16_e32 v0, 3, v0
-; VI-NEXT:    v_or_b32_e32 v9, v9, v10
-; VI-NEXT:    v_or_b32_e32 v8, v8, v19
-; VI-NEXT:    v_or_b32_e32 v7, v7, v18
-; VI-NEXT:    v_or_b32_e32 v6, v6, v17
-; VI-NEXT:    v_or_b32_e32 v5, v5, v16
-; VI-NEXT:    v_or_b32_e32 v4, v4, v15
-; VI-NEXT:    v_or_b32_e32 v3, v3, v14
-; VI-NEXT:    v_or_b32_e32 v2, v2, v13
-; VI-NEXT:    v_or_b32_e32 v1, v1, v12
-; VI-NEXT:    v_or_b32_e32 v0, v0, v11
-; VI-NEXT:  .LBB22_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v20i16_to_v20f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB22_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB22_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v20i16_to_v20f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB22_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:  .LBB22_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = add <20 x i16> %a, splat (i16 3)
-  %a2 = bitcast <20 x i16> %a1 to <20 x half>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <20 x i16> %a to <20 x half>
-  br label %end
-
-end:
-  %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x half> %phi
-}
-
-define <20 x i16> @bitcast_v20f16_to_v20i16(<20 x half> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20f16_to_v20i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB23_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.true
-; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; GCN-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v20
-; GCN-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; GCN-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; GCN-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; GCN-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v20
-; GCN-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; GCN-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; GCN-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; GCN-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v11
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v20
-; GCN-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GCN-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; GCN-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v20
-; GCN-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GCN-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; GCN-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; GCN-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v20
-; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; GCN-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; GCN-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GCN-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; GCN-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GCN-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; GCN-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
-; GCN-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; GCN-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; GCN-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; GCN-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; GCN-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; GCN-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; GCN-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; GCN-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v5
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v9
-; GCN-NEXT:    v_or_b32_e32 v12, v12, v13
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v17
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; GCN-NEXT:    v_alignbit_b32 v17, v18, v17, 16
-; GCN-NEXT:  .LBB23_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: bitcast_v20f16_to_v20i16:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB23_2
-; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v11, 0x200
-; VI-NEXT:    v_add_f16_e32 v10, 0x200, v0
-; VI-NEXT:    v_add_f16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v12, 0x200, v1
-; VI-NEXT:    v_add_f16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v13, 0x200, v2
-; VI-NEXT:    v_add_f16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v14, 0x200, v3
-; VI-NEXT:    v_add_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v15, 0x200, v4
-; VI-NEXT:    v_add_f16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v16, 0x200, v5
-; VI-NEXT:    v_add_f16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v17, 0x200, v6
-; VI-NEXT:    v_add_f16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v18, 0x200, v7
-; VI-NEXT:    v_add_f16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v19, 0x200, v8
-; VI-NEXT:    v_add_f16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_sdwa v11, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_f16_e32 v9, 0x200, v9
-; VI-NEXT:    v_or_b32_e32 v9, v9, v11
-; VI-NEXT:    v_or_b32_e32 v8, v19, v8
-; VI-NEXT:    v_or_b32_e32 v7, v18, v7
-; VI-NEXT:    v_or_b32_e32 v6, v17, v6
-; VI-NEXT:    v_or_b32_e32 v5, v16, v5
-; VI-NEXT:    v_or_b32_e32 v4, v15, v4
-; VI-NEXT:    v_or_b32_e32 v3, v14, v3
-; VI-NEXT:    v_or_b32_e32 v2, v13, v2
-; VI-NEXT:    v_or_b32_e32 v1, v12, v1
-; VI-NEXT:    v_or_b32_e32 v0, v10, v0
-; VI-NEXT:  .LBB23_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v20f16_to_v20i16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB23_2
-; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    s_movk_i32 s6, 0x200
-; GFX9-NEXT:    v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB23_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: bitcast_v20f16_to_v20i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v10
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB23_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:  .LBB23_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %cmp = icmp eq i32 %b, 0
-  br i1 %cmp, label %cmp.true, label %cmp.false
-
-cmp.true:
-  %a1 = fadd <20 x half> %a, splat (half 0xH0200)
-  %a2 = bitcast <20 x half> %a1 to <20 x i16>
-  br label %end
-
-cmp.false:
-  %a3 = bitcast <20 x half> %a to <20 x i16>
-  br label %end
-
-end:
-  %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x i16> %phi
-}
-
-define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20i16_to_v40i8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v21
-; GCN-NEXT:    v_lshlrev_b32_e32 v56, 16, v2
-; GCN-NEXT:    s_waitcnt expcnt(6)
-; GCN-NEXT:    v_lshlrev_b32_e32 v57, 16, v4
-; GCN-NEXT:    s_waitcnt expcnt(5)
-; GCN-NEXT:    v_lshlrev_b32_e32 v58, 16, v6
-; GCN-NEXT:    s_waitcnt expcnt(4)
-; GCN-NEXT:    v_lshlrev_b32_e32 v59, 16, v8
-; GCN-NEXT:    s_waitcnt expcnt(3)
-; GCN-NEXT:    v_lshlrev_b32_e32 v60, 16, v10
-; GCN-NEXT:    s_waitcnt expcnt(2)
-; GCN-NEXT:    v_lshlrev_b32_e32 v61, 16, v12
-; GCN-NEXT:    s_waitcnt expcnt(1)
-; GCN-NEXT:    v_lshlrev_b32_e32 v62, 16, v14
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v63, 16, v16
-; GCN-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v20
-; GCN-NEXT:    ; implicit-def: $vgpr50
-; GCN-NEXT:    ; implicit-def: $vgpr44
-; GCN-NEXT:    ; implicit-def: $vgpr43
-; GCN-NEXT:    ; implicit-def: $vgpr41
-; GCN-NEXT:    ; implicit-def: $vgpr39
-; GCN-NEXT:    ; implicit-def: $vgpr47
-; GCN-NEXT:    ; implicit-def: $vgpr51
-; GCN-NEXT:    ; implicit-def: $vgpr54
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GCN-NEXT:    ; implicit-def: $vgpr35
+; GCN-NEXT:    ; implicit-def: $vgpr33
 ; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr42
-; GCN-NEXT:    ; implicit-def: $vgpr55
-; GCN-NEXT:    ; implicit-def: $vgpr53
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr46
-; GCN-NEXT:    ; implicit-def: $vgpr34
-; GCN-NEXT:    ; implicit-def: $vgpr36
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr52
 ; GCN-NEXT:    ; implicit-def: $vgpr48
+; GCN-NEXT:    ; implicit-def: $vgpr39
+; GCN-NEXT:    ; implicit-def: $vgpr38
+; GCN-NEXT:    ; implicit-def: $vgpr29
+; GCN-NEXT:    ; implicit-def: $vgpr27
+; GCN-NEXT:    ; implicit-def: $vgpr26
 ; GCN-NEXT:    ; implicit-def: $vgpr37
+; GCN-NEXT:    ; implicit-def: $vgpr36
+; GCN-NEXT:    ; implicit-def: $vgpr34
+; GCN-NEXT:    ; implicit-def: $vgpr23
 ; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr45
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr14
-; GCN-NEXT:    ; implicit-def: $vgpr35
-; GCN-NEXT:    ; implicit-def: $vgpr33
+; GCN-NEXT:    ; implicit-def: $vgpr20
 ; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr10
-; GCN-NEXT:    ; implicit-def: $vgpr40
-; GCN-NEXT:    ; implicit-def: $vgpr2
-; GCN-NEXT:    ; kill: killed $vgpr2
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr6
 ; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr27
+; GCN-NEXT:    ; implicit-def: $vgpr28
+; GCN-NEXT:    ; implicit-def: $vgpr17
+; GCN-NEXT:    ; implicit-def: $vgpr15
+; GCN-NEXT:    ; implicit-def: $vgpr14
 ; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr2
-; GCN-NEXT:    ; implicit-def: $vgpr49
-; GCN-NEXT:    ; implicit-def: $vgpr38
-; GCN-NEXT:    ; kill: killed $vgpr38
-; GCN-NEXT:    ; implicit-def: $vgpr38
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB24_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v7
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v9
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v11
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v13
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v15
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v17
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff, v19
-; GCN-NEXT:    v_and_b32_e32 v51, 0xffff, v4
-; GCN-NEXT:    v_and_b32_e32 v34, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v26, 0xffff, v12
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v16
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v20
-; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT:    v_bfe_u32 v54, v4, 8, 8
-; GCN-NEXT:    v_bfe_u32 v36, v8, 8, 8
-; GCN-NEXT:    v_bfe_u32 v29, v12, 8, 8
-; GCN-NEXT:    v_bfe_u32 v24, v16, 8, 8
-; GCN-NEXT:    v_or_b32_e32 v50, v1, v56
-; GCN-NEXT:    v_or_b32_e32 v39, v2, v57
-; GCN-NEXT:    v_or_b32_e32 v32, v3, v58
-; GCN-NEXT:    v_or_b32_e32 v28, v5, v59
-; GCN-NEXT:    v_or_b32_e32 v23, v6, v60
-; GCN-NEXT:    v_or_b32_e32 v21, v7, v61
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_or_b32_e32 v14, v9, v62
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v63
-; GCN-NEXT:    v_or_b32_e32 v6, v11, v18
-; GCN-NEXT:    v_or_b32_e32 v2, v13, v22
-; GCN-NEXT:    v_alignbit_b32 v41, v39, v50, 24
-; GCN-NEXT:    v_alignbit_b32 v43, v39, v50, 16
-; GCN-NEXT:    v_alignbit_b32 v44, v39, v50, 8
-; GCN-NEXT:    v_alignbit_b32 v53, v28, v32, 24
-; GCN-NEXT:    v_alignbit_b32 v55, v28, v32, 16
-; GCN-NEXT:    v_alignbit_b32 v42, v28, v32, 8
-; GCN-NEXT:    v_alignbit_b32 v37, v21, v23, 24
-; GCN-NEXT:    v_alignbit_b32 v48, v21, v23, 16
-; GCN-NEXT:    v_alignbit_b32 v52, v21, v23, 8
-; GCN-NEXT:    v_alignbit_b32 v31, v10, v14, 24
-; GCN-NEXT:    v_alignbit_b32 v33, v10, v14, 16
-; GCN-NEXT:    v_alignbit_b32 v35, v10, v14, 8
-; GCN-NEXT:    v_alignbit_b32 v25, v2, v6, 24
-; GCN-NEXT:    v_alignbit_b32 v27, v2, v6, 16
-; GCN-NEXT:    v_alignbit_b32 v30, v2, v6, 8
-; GCN-NEXT:    v_lshrrev_b32_e32 v47, 8, v39
-; GCN-NEXT:    v_lshrrev_b32_e32 v46, 8, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v45, 8, v21
-; GCN-NEXT:    v_lshrrev_b32_e32 v40, 8, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v49, 8, v2
-; GCN-NEXT:    v_bfe_u32 v38, v20, 8, 8
-; GCN-NEXT:    ; implicit-def: $vgpr1
-; GCN-NEXT:    ; implicit-def: $vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr5
-; GCN-NEXT:    ; implicit-def: $vgpr7
-; GCN-NEXT:    ; implicit-def: $vgpr9
-; GCN-NEXT:    ; implicit-def: $vgpr11
+; GCN-NEXT:    ; implicit-def: $vgpr24
+; GCN-NEXT:    ; implicit-def: $vgpr22
 ; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr17
+; GCN-NEXT:    ; implicit-def: $vgpr12
+; GCN-NEXT:    ; implicit-def: $vgpr11
 ; GCN-NEXT:    ; implicit-def: $vgpr19
-; GCN-NEXT:    ; implicit-def: $vgpr56
-; GCN-NEXT:    ; implicit-def: $vgpr57
-; GCN-NEXT:    ; implicit-def: $vgpr58
-; GCN-NEXT:    ; implicit-def: $vgpr59
-; GCN-NEXT:    ; implicit-def: $vgpr60
-; GCN-NEXT:    ; implicit-def: $vgpr61
-; GCN-NEXT:    ; implicit-def: $vgpr62
-; GCN-NEXT:    ; implicit-def: $vgpr63
 ; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:  .LBB24_2: ; %Flow
+; GCN-NEXT:    ; implicit-def: $vgpr16
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT:    s_cbranch_execz .LBB12_2
+; GCN-NEXT:  ; %bb.1: ; %cmp.false
+; GCN-NEXT:    v_alignbit_b32 v11, v10, v9, 24
+; GCN-NEXT:    v_alignbit_b32 v12, v10, v9, 16
+; GCN-NEXT:    v_alignbit_b32 v13, v10, v9, 8
+; GCN-NEXT:    v_alignbit_b32 v14, v8, v7, 24
+; GCN-NEXT:    v_alignbit_b32 v15, v8, v7, 16
+; GCN-NEXT:    v_alignbit_b32 v17, v8, v7, 8
+; GCN-NEXT:    v_alignbit_b32 v20, v6, v5, 24
+; GCN-NEXT:    v_alignbit_b32 v21, v6, v5, 16
+; GCN-NEXT:    v_alignbit_b32 v23, v6, v5, 8
+; GCN-NEXT:    v_alignbit_b32 v26, v4, v3, 24
+; GCN-NEXT:    v_alignbit_b32 v27, v4, v3, 16
+; GCN-NEXT:    v_alignbit_b32 v29, v4, v3, 8
+; GCN-NEXT:    v_alignbit_b32 v32, v2, v1, 24
+; GCN-NEXT:    v_alignbit_b32 v33, v2, v1, 16
+; GCN-NEXT:    v_alignbit_b32 v35, v2, v1, 8
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 24, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v25, 8, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v28, 24, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v34, 24, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v37, 8, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
+; GCN-NEXT:  .LBB12_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB24_4
+; GCN-NEXT:    s_cbranch_execz .LBB12_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v17
-; GCN-NEXT:    s_mov_b32 s6, 0x30000
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v19
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v13
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v15
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v9
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 3, v11
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_alignbit_b32 v11, v10, v9, 24
+; GCN-NEXT:    v_alignbit_b32 v12, v10, v9, 16
+; GCN-NEXT:    v_alignbit_b32 v13, v10, v9, 8
+; GCN-NEXT:    v_alignbit_b32 v14, v8, v7, 24
+; GCN-NEXT:    v_alignbit_b32 v15, v8, v7, 16
+; GCN-NEXT:    v_alignbit_b32 v17, v8, v7, 8
+; GCN-NEXT:    v_alignbit_b32 v20, v6, v5, 24
+; GCN-NEXT:    v_alignbit_b32 v21, v6, v5, 16
+; GCN-NEXT:    v_alignbit_b32 v23, v6, v5, 8
+; GCN-NEXT:    v_alignbit_b32 v26, v4, v3, 24
+; GCN-NEXT:    v_alignbit_b32 v27, v4, v3, 16
+; GCN-NEXT:    v_alignbit_b32 v29, v4, v3, 8
+; GCN-NEXT:    v_alignbit_b32 v32, v2, v1, 24
+; GCN-NEXT:    v_alignbit_b32 v33, v2, v1, 16
+; GCN-NEXT:    v_alignbit_b32 v35, v2, v1, 8
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 24, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v24, 16, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v25, 8, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v28, 24, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v31, 8, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v34, 24, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v37, 8, v4
+; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
+; GCN-NEXT:  .LBB12_4: ; %end
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_and_b32_e32 v49, 0xff, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v35, 8, v35
+; GCN-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GCN-NEXT:    v_lshlrev_b32_e32 v32, 24, v32
+; GCN-NEXT:    v_and_b32_e32 v50, 0xff, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v48, 8, v48
+; GCN-NEXT:    v_and_b32_e32 v39, 0xff, v39
+; GCN-NEXT:    v_lshlrev_b32_e32 v38, 24, v38
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
+; GCN-NEXT:    v_and_b32_e32 v27, 0xff, v27
+; GCN-NEXT:    v_lshlrev_b32_e32 v26, 24, v26
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v37, 8, v37
+; GCN-NEXT:    v_and_b32_e32 v36, 0xff, v36
+; GCN-NEXT:    v_lshlrev_b32_e32 v34, 24, v34
+; GCN-NEXT:    v_add_i32_e32 v51, vcc, 12, v0
+; GCN-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
+; GCN-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GCN-NEXT:    v_lshlrev_b32_e32 v20, 24, v20
+; GCN-NEXT:    v_or_b32_e32 v35, v49, v35
+; GCN-NEXT:    v_add_i32_e32 v49, vcc, 16, v0
+; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v31, 8, v31
+; GCN-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GCN-NEXT:    v_lshlrev_b32_e32 v28, 24, v28
+; GCN-NEXT:    v_or_b32_e32 v48, v50, v48
+; GCN-NEXT:    v_add_i32_e32 v50, vcc, 20, v0
+; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GCN-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; GCN-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GCN-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v29
+; GCN-NEXT:    v_add_i32_e32 v29, vcc, 24, v0
+; GCN-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
+; GCN-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GCN-NEXT:    v_lshlrev_b32_e32 v22, 24, v22
+; GCN-NEXT:    v_or_b32_e32 v4, v4, v37
+; GCN-NEXT:    v_add_i32_e32 v37, vcc, 28, v0
+; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GCN-NEXT:    v_or_b32_e32 v5, v5, v23
+; GCN-NEXT:    v_add_i32_e32 v23, vcc, 32, v0
+; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GCN-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
+; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
+; GCN-NEXT:    v_or_b32_e32 v6, v6, v31
+; GCN-NEXT:    v_add_i32_e32 v31, vcc, 36, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
+; GCN-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
+; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
+; GCN-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT:    v_or_b32_e32 v7, v7, v17
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_or_b32_e32 v8, v8, v25
+; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v24
+; GCN-NEXT:    v_or_b32_e32 v9, v9, v13
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT:    v_or_b32_e32 v10, v10, v19
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v18
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v35
+; GCN-NEXT:    v_or_b32_e32 v19, v32, v33
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff, v48
+; GCN-NEXT:    v_or_b32_e32 v25, v38, v39
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT:    v_or_b32_e32 v26, v26, v27
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_or_b32_e32 v27, v34, v36
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT:    v_or_b32_e32 v20, v20, v21
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    v_or_b32_e32 v21, v28, v30
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v15
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT:    v_or_b32_e32 v15, v22, v17
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT:    v_or_b32_e32 v11, v11, v12
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT:    v_or_b32_e32 v2, v18, v2
-; GCN-NEXT:    v_or_b32_e32 v4, v22, v4
-; GCN-NEXT:    v_or_b32_e32 v11, v62, v6
-; GCN-NEXT:    v_or_b32_e32 v8, v63, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v60, v9
-; GCN-NEXT:    v_or_b32_e32 v12, v61, v10
-; GCN-NEXT:    v_or_b32_e32 v5, v58, v5
-; GCN-NEXT:    v_or_b32_e32 v7, v59, v7
-; GCN-NEXT:    v_or_b32_e32 v1, v56, v1
-; GCN-NEXT:    v_or_b32_e32 v3, v57, v3
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 0x30000, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s6, v4
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, s6, v11
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, s6, v8
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, s6, v9
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, s6, v12
-; GCN-NEXT:    v_add_i32_e32 v32, vcc, s6, v5
-; GCN-NEXT:    v_add_i32_e32 v28, vcc, s6, v7
-; GCN-NEXT:    v_add_i32_e32 v50, vcc, s6, v1
-; GCN-NEXT:    v_add_i32_e32 v39, vcc, s6, v3
-; GCN-NEXT:    v_alignbit_b32 v41, v39, v50, 24
-; GCN-NEXT:    v_alignbit_b32 v43, v39, v50, 16
-; GCN-NEXT:    v_alignbit_b32 v44, v39, v50, 8
-; GCN-NEXT:    v_alignbit_b32 v53, v28, v32, 24
-; GCN-NEXT:    v_alignbit_b32 v55, v28, v32, 16
-; GCN-NEXT:    v_alignbit_b32 v42, v28, v32, 8
-; GCN-NEXT:    v_alignbit_b32 v37, v21, v23, 24
-; GCN-NEXT:    v_alignbit_b32 v48, v21, v23, 16
-; GCN-NEXT:    v_alignbit_b32 v52, v21, v23, 8
-; GCN-NEXT:    v_alignbit_b32 v31, v10, v14, 24
-; GCN-NEXT:    v_alignbit_b32 v33, v10, v14, 16
-; GCN-NEXT:    v_alignbit_b32 v35, v10, v14, 8
-; GCN-NEXT:    v_alignbit_b32 v25, v2, v6, 24
-; GCN-NEXT:    v_alignbit_b32 v27, v2, v6, 16
-; GCN-NEXT:    v_alignbit_b32 v30, v2, v6, 8
-; GCN-NEXT:    v_lshrrev_b32_e32 v54, 24, v39
-; GCN-NEXT:    v_lshrrev_b32_e32 v51, 16, v39
-; GCN-NEXT:    v_lshrrev_b32_e32 v47, 8, v39
-; GCN-NEXT:    v_lshrrev_b32_e32 v36, 24, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v34, 16, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v46, 8, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v29, 24, v21
-; GCN-NEXT:    v_lshrrev_b32_e32 v26, 16, v21
-; GCN-NEXT:    v_lshrrev_b32_e32 v45, 8, v21
-; GCN-NEXT:    v_lshrrev_b32_e32 v24, 24, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT:    v_lshrrev_b32_e32 v40, 8, v10
-; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT:    v_lshrrev_b32_e32 v49, 8, v2
-; GCN-NEXT:  .LBB24_4: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v50
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 8, v44
-; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v43
-; GCN-NEXT:    v_lshlrev_b32_e32 v8, 24, v41
-; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v39
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 8, v47
-; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v51
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 24, v54
-; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
-; GCN-NEXT:    v_and_b32_e32 v15, 0xff, v32
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 8, v42
-; GCN-NEXT:    v_and_b32_e32 v17, 0xff, v55
-; GCN-NEXT:    v_lshlrev_b32_e32 v18, 24, v53
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
-; GCN-NEXT:    v_and_b32_e32 v19, 0xff, v28
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 8, v46
-; GCN-NEXT:    v_and_b32_e32 v22, 0xff, v34
-; GCN-NEXT:    v_lshlrev_b32_e32 v28, 24, v36
-; GCN-NEXT:    v_add_i32_e32 v32, vcc, 12, v0
-; GCN-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GCN-NEXT:    v_lshlrev_b32_e32 v34, 8, v52
-; GCN-NEXT:    v_and_b32_e32 v36, 0xff, v48
-; GCN-NEXT:    v_lshlrev_b32_e32 v37, 24, v37
-; GCN-NEXT:    v_add_i32_e32 v39, vcc, 16, v0
-; GCN-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GCN-NEXT:    v_lshlrev_b32_e32 v48, 8, v45
-; GCN-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GCN-NEXT:    v_lshlrev_b32_e32 v29, 24, v29
-; GCN-NEXT:    v_add_i32_e32 v50, vcc, 20, v0
-; GCN-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GCN-NEXT:    v_lshlrev_b32_e32 v35, 8, v35
-; GCN-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GCN-NEXT:    v_lshlrev_b32_e32 v31, 24, v31
-; GCN-NEXT:    v_add_i32_e32 v51, vcc, 24, v0
-; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GCN-NEXT:    v_lshlrev_b32_e32 v52, 8, v40
-; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v53, 0xff, v53
-; GCN-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
-; GCN-NEXT:    v_add_i32_e32 v54, vcc, 28, v0
-; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GCN-NEXT:    v_lshlrev_b32_e32 v30, 8, v30
-; GCN-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; GCN-NEXT:    v_lshlrev_b32_e32 v25, 24, v25
-; GCN-NEXT:    v_add_i32_e32 v55, vcc, 32, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v49, 8, v49
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v40, 0xff, v40
-; GCN-NEXT:    v_lshlrev_b32_e32 v38, 24, v38
-; GCN-NEXT:    v_add_i32_e32 v41, vcc, 36, v0
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
-; GCN-NEXT:    v_or_b32_e32 v7, v9, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v12
-; GCN-NEXT:    v_or_b32_e32 v11, v15, v16
-; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v17
-; GCN-NEXT:    v_or_b32_e32 v15, v19, v20
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v22
-; GCN-NEXT:    v_or_b32_e32 v17, v23, v34
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
-; GCN-NEXT:    v_or_b32_e32 v20, v21, v48
-; GCN-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v35
-; GCN-NEXT:    v_lshlrev_b32_e32 v22, 16, v33
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v52
-; GCN-NEXT:    v_lshlrev_b32_e32 v23, 16, v53
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v30
-; GCN-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v49
-; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v40
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v8, v5
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v13, v9
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v11
-; GCN-NEXT:    v_or_b32_e32 v11, v18, v12
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v15
-; GCN-NEXT:    v_or_b32_e32 v13, v28, v16
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff, v17
-; GCN-NEXT:    v_or_b32_e32 v16, v37, v19
-; GCN-NEXT:    v_and_b32_e32 v17, 0xffff, v20
-; GCN-NEXT:    v_or_b32_e32 v18, v29, v21
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT:    v_or_b32_e32 v19, v31, v22
-; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT:    v_or_b32_e32 v20, v24, v23
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_or_b32_e32 v21, v25, v26
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_or_b32_e32 v22, v38, v27
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v5
-; GCN-NEXT:    v_or_b32_e32 v5, v7, v8
-; GCN-NEXT:    v_or_b32_e32 v7, v9, v11
-; GCN-NEXT:    v_or_b32_e32 v8, v12, v13
-; GCN-NEXT:    v_or_b32_e32 v9, v15, v16
-; GCN-NEXT:    v_or_b32_e32 v11, v17, v18
-; GCN-NEXT:    v_or_b32_e32 v12, v14, v19
-; GCN-NEXT:    v_or_b32_e32 v10, v10, v20
+; GCN-NEXT:    v_or_b32_e32 v12, v16, v13
+; GCN-NEXT:    v_or_b32_e32 v13, v18, v19
+; GCN-NEXT:    v_or_b32_e32 v16, v24, v25
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v26
+; GCN-NEXT:    v_or_b32_e32 v4, v4, v27
+; GCN-NEXT:    v_or_b32_e32 v5, v5, v20
 ; GCN-NEXT:    v_or_b32_e32 v6, v6, v21
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v22
-; GCN-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v8, v32, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v9, v39, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v11, v50, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v12, v51, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v10, v54, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v6, v55, s[0:3], 0 offen
-; GCN-NEXT:    buffer_store_dword v2, v41, s[0:3], 0 offen
-; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT:    v_or_b32_e32 v7, v7, v14
+; GCN-NEXT:    v_or_b32_e32 v8, v8, v15
+; GCN-NEXT:    v_or_b32_e32 v9, v9, v11
+; GCN-NEXT:    v_or_b32_e32 v10, v10, v12
+; GCN-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v4, v51, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v5, v49, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v6, v50, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v7, v29, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v8, v37, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v9, v23, s[0:3], 0 offen
+; GCN-NEXT:    buffer_store_dword v10, v31, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v20i16_to_v40i8:
+; VI-LABEL: bitcast_v10f32_to_v40i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v24, 16, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v1
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr16
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr15
-; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr39
 ; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr14
 ; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr50
 ; VI-NEXT:    ; implicit-def: $vgpr36
-; VI-NEXT:    ; implicit-def: $vgpr13
-; VI-NEXT:    ; implicit-def: $vgpr49
 ; VI-NEXT:    ; implicit-def: $vgpr35
-; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr14
+; VI-NEXT:    ; implicit-def: $vgpr34
+; VI-NEXT:    ; implicit-def: $vgpr33
+; VI-NEXT:    ; implicit-def: $vgpr32
 ; VI-NEXT:    ; implicit-def: $vgpr31
-; VI-NEXT:    ; implicit-def: $vgpr12
-; VI-NEXT:    ; implicit-def: $vgpr53
 ; VI-NEXT:    ; implicit-def: $vgpr30
-; VI-NEXT:    ; implicit-def: $vgpr33
-; VI-NEXT:    ; implicit-def: $vgpr43
+; VI-NEXT:    ; implicit-def: $vgpr13
+; VI-NEXT:    ; implicit-def: $vgpr29
 ; VI-NEXT:    ; implicit-def: $vgpr28
-; VI-NEXT:    ; implicit-def: $vgpr11
-; VI-NEXT:    ; implicit-def: $vgpr42
 ; VI-NEXT:    ; implicit-def: $vgpr27
-; VI-NEXT:    ; implicit-def: $vgpr29
+; VI-NEXT:    ; implicit-def: $vgpr26
+; VI-NEXT:    ; implicit-def: $vgpr25
+; VI-NEXT:    ; implicit-def: $vgpr12
+; VI-NEXT:    ; implicit-def: $vgpr24
+; VI-NEXT:    ; implicit-def: $vgpr23
+; VI-NEXT:    ; implicit-def: $vgpr22
+; VI-NEXT:    ; implicit-def: $vgpr21
+; VI-NEXT:    ; implicit-def: $vgpr20
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr18
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB24_2
+; VI-NEXT:    s_cbranch_execz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
 ; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
 ; VI-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; VI-NEXT:    v_lshrrev_b32_e32 v29, 24, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v27, 8, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v28, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v33, 24, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v30, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v39, 24, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v41, 24, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v54, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v1
 ; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; VI-NEXT:    v_mov_b32_e32 v34, v1
-; VI-NEXT:    v_mov_b32_e32 v32, v2
-; VI-NEXT:    v_mov_b32_e32 v38, v3
-; VI-NEXT:    v_mov_b32_e32 v37, v4
-; VI-NEXT:    v_mov_b32_e32 v50, v5
-; VI-NEXT:    v_mov_b32_e32 v49, v6
-; VI-NEXT:    v_mov_b32_e32 v55, v7
-; VI-NEXT:    v_mov_b32_e32 v53, v8
-; VI-NEXT:    v_mov_b32_e32 v43, v9
-; VI-NEXT:    v_mov_b32_e32 v42, v10
-; VI-NEXT:    ; implicit-def: $vgpr1
-; VI-NEXT:    ; implicit-def: $vgpr3
-; VI-NEXT:    ; implicit-def: $vgpr5
-; VI-NEXT:    ; implicit-def: $vgpr7
-; VI-NEXT:    ; implicit-def: $vgpr9
-; VI-NEXT:  .LBB24_2: ; %Flow
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; VI-NEXT:  .LBB12_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB24_4
+; VI-NEXT:    s_cbranch_execz .LBB12_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v11, 3
-; VI-NEXT:    v_add_u16_sdwa v17, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v42, 3, v10
-; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; VI-NEXT:    v_add_u16_e32 v43, 3, v9
-; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v20
-; VI-NEXT:    v_add_u16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v53, 3, v8
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v18
-; VI-NEXT:    v_add_u16_e32 v55, 3, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v22
-; VI-NEXT:    v_or_b32_e32 v10, v42, v10
-; VI-NEXT:    v_or_b32_e32 v9, v43, v9
-; VI-NEXT:    v_add_u16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v26, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v49, 3, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; VI-NEXT:    v_add_u16_e32 v50, 3, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v24
-; VI-NEXT:    v_or_b32_e32 v8, v53, v8
-; VI-NEXT:    v_or_b32_e32 v7, v55, v7
+; VI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT:    v_add_f32_e32 v7, 1.0, v7
 ; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; VI-NEXT:    v_add_u16_e32 v37, 3, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v21
-; VI-NEXT:    v_add_u16_e32 v38, 3, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v25
-; VI-NEXT:    v_or_b32_e32 v6, v49, v6
-; VI-NEXT:    v_or_b32_e32 v5, v50, v5
+; VI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT:    v_add_f32_e32 v5, 1.0, v5
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; VI-NEXT:    v_add_u16_e32 v32, 3, v2
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
-; VI-NEXT:    v_add_u16_e32 v34, 3, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v26
-; VI-NEXT:    v_or_b32_e32 v4, v37, v4
-; VI-NEXT:    v_or_b32_e32 v3, v38, v3
+; VI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT:    v_add_f32_e32 v3, 1.0, v3
 ; VI-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; VI-NEXT:    v_or_b32_e32 v2, v32, v2
-; VI-NEXT:    v_or_b32_e32 v1, v34, v1
+; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; VI-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
 ; VI-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; VI-NEXT:    v_lshrrev_b32_e32 v27, 8, v10
-; VI-NEXT:    v_lshrrev_b32_e32 v28, 8, v9
-; VI-NEXT:    v_lshrrev_b32_e32 v30, 8, v8
-; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v7
-; VI-NEXT:    v_lshrrev_b32_e32 v35, 8, v6
-; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v5
-; VI-NEXT:    v_lshrrev_b32_e32 v48, 8, v4
-; VI-NEXT:    v_lshrrev_b32_e32 v51, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v54, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v40, 8, v1
-; VI-NEXT:    v_bfe_u32 v29, v17, 8, 8
-; VI-NEXT:    v_bfe_u32 v33, v18, 8, 8
-; VI-NEXT:    v_bfe_u32 v39, v19, 8, 8
-; VI-NEXT:    v_bfe_u32 v52, v21, 8, 8
-; VI-NEXT:    v_bfe_u32 v41, v23, 8, 8
-; VI-NEXT:  .LBB24_4: ; %end
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; VI-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; VI-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; VI-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; VI-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; VI-NEXT:  .LBB12_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v40
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v15
-; VI-NEXT:    v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
+; VI-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
+; VI-NEXT:    v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v54
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v41
-; VI-NEXT:    v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v39
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v37
+; VI-NEXT:    v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v51
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v14
-; VI-NEXT:    v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v48
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v52
-; VI-NEXT:    v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v34
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v32
+; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 12, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v36
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v13
-; VI-NEXT:    v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 16, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v35
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v39
-; VI-NEXT:    v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v29
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v27
+; VI-NEXT:    v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 20, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v26
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v12
-; VI-NEXT:    v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 24, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v30
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v33
-; VI-NEXT:    v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v24
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v22
+; VI-NEXT:    v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 28, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v28
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v21
 ; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v11
-; VI-NEXT:    v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 32, v0
 ; VI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v27
-; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v29
-; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v19
+; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v17
+; VI-NEXT:    v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
 ; VI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v20i16_to_v40i8:
+; GFX9-LABEL: bitcast_v10f32_to_v40i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
@@ -7531,7 +4364,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr11
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB24_2
+; GFX9-NEXT:    s_cbranch_execz .LBB12_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -7563,23 +4396,23 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB24_2: ; %Flow
+; GFX9-NEXT:  .LBB12_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB24_4
+; GFX9-NEXT:    s_cbranch_execz .LBB12_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
-; GFX9-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT:    v_add_f32_e32 v7, 1.0, v7
 ; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX9-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, 1.0, v5
 ; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX9-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v3
 ; GFX9-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX9-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GFX9-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
 ; GFX9-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
@@ -7607,7 +4440,7 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB24_4: ; %end
+; GFX9-NEXT:  .LBB12_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -7672,227 +4505,403 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v20i16_to_v40i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB24_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB24_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v16
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xff, v48
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v31, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v39
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v16
-; GFX11-NEXT:    v_or_b32_e32 v15, v48, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v14, v35, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v31
-; GFX11-NEXT:    v_or_b32_e32 v13, v30, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_or_b32_e32 v16, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v29
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v27
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v26
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v32
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v34
-; GFX11-NEXT:    v_or_b32_e32 v32, v33, v32
-; GFX11-NEXT:    v_or_b32_e32 v12, v25, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v16
-; GFX11-NEXT:    v_or_b32_e32 v11, v20, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v30
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v15
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v10f32_to_v40i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB12_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v1.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v2.h, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v14, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v6.h, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v30, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v26, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v13, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v17, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB12_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v31, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v48, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v35, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v38, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v33, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v25, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v20, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v15
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
 cmp.true:
-  %a1 = add <20 x i16> %a, splat (i16 3)
-  %a2 = bitcast <20 x i16> %a1 to <40 x i8>
+  %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
+  %a2 = bitcast <10 x float> %a1 to <40 x i8>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <20 x i16> %a to <40 x i8>
+  %a3 = bitcast <10 x float> %a to <40 x i8>
   br label %end
 
 end:
@@ -7900,166 +4909,131 @@ end:
   ret <40 x i8> %phi
 }
 
-define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
-; GCN-LABEL: bitcast_v40i8_to_v20i16:
+define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
+; GCN-LABEL: bitcast_v40i8_to_v10f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT:    s_waitcnt expcnt(2)
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:28
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT:    v_mov_b32_e32 v35, v8
+; GCN-NEXT:    v_mov_b32_e32 v34, v6
+; GCN-NEXT:    v_mov_b32_e32 v33, v4
+; GCN-NEXT:    v_mov_b32_e32 v32, v2
+; GCN-NEXT:    v_mov_b32_e32 v31, v0
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:32
+; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:28
+; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:24
+; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:20
+; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:16
 ; GCN-NEXT:    s_waitcnt expcnt(1)
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20
-; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:12
-; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:8
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], s32
-; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
-; GCN-NEXT:    v_lshlrev_b32_e32 v53, 8, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v36, 24, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v37, 24, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v55, 8, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v38, 24, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v39, 24, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v40, 8, v21
-; GCN-NEXT:    v_lshlrev_b32_e32 v48, 24, v23
-; GCN-NEXT:    v_lshlrev_b32_e32 v49, 24, v19
-; GCN-NEXT:    v_lshlrev_b32_e32 v42, 8, v29
-; GCN-NEXT:    v_lshlrev_b32_e32 v50, 24, v27
-; GCN-NEXT:    v_lshlrev_b32_e32 v41, 8, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v43, 8, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v44, 8, v17
-; GCN-NEXT:    v_lshlrev_b32_e32 v45, 8, v25
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32
+; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:36
+; GCN-NEXT:    v_lshlrev_b32_e32 v38, 8, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v36, 24, v3
+; GCN-NEXT:    v_lshlrev_b32_e32 v39, 8, v5
+; GCN-NEXT:    v_lshlrev_b32_e32 v37, 24, v7
+; GCN-NEXT:    v_lshlrev_b32_e32 v48, 8, v9
+; GCN-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; GCN-NEXT:    v_lshlrev_b32_e32 v49, 8, v13
+; GCN-NEXT:    v_lshlrev_b32_e32 v13, 24, v15
+; GCN-NEXT:    v_lshlrev_b32_e32 v50, 8, v17
+; GCN-NEXT:    v_lshlrev_b32_e32 v15, 24, v19
+; GCN-NEXT:    v_lshlrev_b32_e32 v51, 8, v21
+; GCN-NEXT:    v_lshlrev_b32_e32 v17, 24, v23
+; GCN-NEXT:    v_lshlrev_b32_e32 v52, 8, v25
+; GCN-NEXT:    v_lshlrev_b32_e32 v19, 24, v27
+; GCN-NEXT:    v_lshlrev_b32_e32 v27, 8, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v51
-; GCN-NEXT:    v_lshlrev_b32_e32 v51, 24, v35
-; GCN-NEXT:    v_lshlrev_b32_e32 v46, 8, v32
-; GCN-NEXT:    v_lshlrev_b32_e32 v52, 24, v31
-; GCN-NEXT:    v_lshlrev_b32_e32 v54, 24, v33
-; GCN-NEXT:    v_lshlrev_b32_e32 v47, 8, v34
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr1
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr5
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr7
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr9
-; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr33
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr34
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr35
-; GCN-NEXT:    ; implicit-def: $vgpr19
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v53
+; GCN-NEXT:    v_lshlrev_b32_e32 v21, 24, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v29, 8, v6
+; GCN-NEXT:    v_lshlrev_b32_e32 v23, 24, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v53, 8, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v25, 24, v0
+; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB25_2
+; GCN-NEXT:    s_cbranch_execz .LBB13_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; GCN-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v12
-; GCN-NEXT:    v_and_b32_e32 v5, 0xff, v14
-; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v22
-; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v18
-; GCN-NEXT:    v_and_b32_e32 v11, 0xff, v28
-; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v30
-; GCN-NEXT:    v_and_b32_e32 v13, 0xff, v26
-; GCN-NEXT:    v_and_b32_e32 v14, 0xff, v58
-; GCN-NEXT:    v_and_b32_e32 v15, 0xff, v57
-; GCN-NEXT:    v_and_b32_e32 v17, 0xff, v59
-; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GCN-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GCN-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v24
-; GCN-NEXT:    v_and_b32_e32 v19, 0xff, v56
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v53
+; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GCN-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GCN-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v12
+; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v14
+; GCN-NEXT:    v_and_b32_e32 v8, 0xff, v16
+; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v18
+; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v20
+; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v22
+; GCN-NEXT:    v_and_b32_e32 v14, 0xff, v24
+; GCN-NEXT:    v_and_b32_e32 v16, 0xff, v26
+; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v28
+; GCN-NEXT:    v_and_b32_e32 v20, 0xff, v30
+; GCN-NEXT:    v_and_b32_e32 v22, 0xff, v41
+; GCN-NEXT:    v_and_b32_e32 v24, 0xff, v40
+; GCN-NEXT:    v_and_b32_e32 v26, 0xff, v55
+; GCN-NEXT:    v_and_b32_e32 v28, 0xff, v54
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v38
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_or_b32_e32 v2, v2, v39
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v55
+; GCN-NEXT:    v_or_b32_e32 v4, v4, v48
 ; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v40
+; GCN-NEXT:    v_or_b32_e32 v6, v6, v49
+; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_or_b32_e32 v8, v8, v50
 ; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_or_b32_e32 v11, v11, v42
+; GCN-NEXT:    v_or_b32_e32 v10, v10, v51
 ; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT:    v_or_b32_e32 v14, v14, v46
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v41
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v43
-; GCN-NEXT:    v_or_b32_e32 v16, v16, v44
-; GCN-NEXT:    v_or_b32_e32 v18, v18, v45
-; GCN-NEXT:    v_or_b32_e32 v19, v19, v47
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_or_b32_e32 v3, v36, v3
-; GCN-NEXT:    v_or_b32_e32 v2, v37, v2
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v38, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v39, v6
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_or_b32_e32 v9, v48, v9
-; GCN-NEXT:    v_or_b32_e32 v10, v49, v10
-; GCN-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GCN-NEXT:    v_or_b32_e32 v12, v51, v12
-; GCN-NEXT:    v_or_b32_e32 v13, v50, v13
-; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT:    v_or_b32_e32 v20, v52, v15
-; GCN-NEXT:    v_or_b32_e32 v17, v54, v17
+; GCN-NEXT:    v_or_b32_e32 v14, v14, v52
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT:    v_or_b32_e32 v18, v18, v27
+; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT:    v_or_b32_e32 v22, v22, v29
+; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT:    v_or_b32_e32 v26, v26, v53
+; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v36, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT:    v_or_b32_e32 v3, v37, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    v_or_b32_e32 v7, v13, v7
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v15, 0xffff, v16
-; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v18
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v19
-; GCN-NEXT:    v_or_b32_e32 v23, v1, v3
-; GCN-NEXT:    v_or_b32_e32 v27, v4, v5
-; GCN-NEXT:    v_or_b32_e32 v31, v7, v9
-; GCN-NEXT:    v_or_b32_e32 v33, v11, v12
-; GCN-NEXT:    v_or_b32_e32 v35, v14, v20
-; GCN-NEXT:    v_or_b32_e32 v21, v0, v2
-; GCN-NEXT:    v_or_b32_e32 v25, v8, v6
-; GCN-NEXT:    v_or_b32_e32 v29, v15, v10
-; GCN-NEXT:    v_or_b32_e32 v32, v16, v13
-; GCN-NEXT:    v_or_b32_e32 v34, v18, v17
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v12
-; GCN-NEXT:    v_alignbit_b32 v1, v23, v2, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v27, v6, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v31, v10, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v33, v13, 16
-; GCN-NEXT:    v_alignbit_b32 v17, v35, v17, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v20
-; GCN-NEXT:    ; implicit-def: $vgpr0
-; GCN-NEXT:    ; implicit-def: $vgpr2
-; GCN-NEXT:    ; implicit-def: $vgpr4
-; GCN-NEXT:    ; implicit-def: $vgpr6
-; GCN-NEXT:    ; implicit-def: $vgpr8
+; GCN-NEXT:    v_or_b32_e32 v9, v15, v9
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT:    v_or_b32_e32 v11, v17, v12
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v14
+; GCN-NEXT:    v_or_b32_e32 v13, v19, v16
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v18
+; GCN-NEXT:    v_or_b32_e32 v15, v21, v20
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v22
+; GCN-NEXT:    v_or_b32_e32 v17, v23, v24
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v26
+; GCN-NEXT:    v_or_b32_e32 v19, v25, v27
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_or_b32_e32 v1, v2, v3
+; GCN-NEXT:    v_or_b32_e32 v2, v4, v5
+; GCN-NEXT:    v_or_b32_e32 v3, v6, v7
+; GCN-NEXT:    v_or_b32_e32 v4, v8, v9
+; GCN-NEXT:    v_or_b32_e32 v5, v10, v11
+; GCN-NEXT:    v_or_b32_e32 v6, v12, v13
+; GCN-NEXT:    v_or_b32_e32 v7, v14, v15
+; GCN-NEXT:    v_or_b32_e32 v8, v16, v17
+; GCN-NEXT:    v_or_b32_e32 v9, v18, v19
+; GCN-NEXT:    ; implicit-def: $vgpr31
+; GCN-NEXT:    ; implicit-def: $vgpr32
+; GCN-NEXT:    ; implicit-def: $vgpr33
+; GCN-NEXT:    ; implicit-def: $vgpr34
+; GCN-NEXT:    ; implicit-def: $vgpr35
 ; GCN-NEXT:    ; implicit-def: $vgpr10
 ; GCN-NEXT:    ; implicit-def: $vgpr12
 ; GCN-NEXT:    ; implicit-def: $vgpr14
@@ -8071,283 +5045,247 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr26
 ; GCN-NEXT:    ; implicit-def: $vgpr28
 ; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr56
-; GCN-NEXT:    ; implicit-def: $vgpr59
-; GCN-NEXT:    ; implicit-def: $vgpr58
-; GCN-NEXT:    ; implicit-def: $vgpr57
-; GCN-NEXT:    ; implicit-def: $vgpr53
-; GCN-NEXT:    ; implicit-def: $vgpr36
-; GCN-NEXT:    ; implicit-def: $vgpr37
+; GCN-NEXT:    ; implicit-def: $vgpr41
+; GCN-NEXT:    ; implicit-def: $vgpr40
 ; GCN-NEXT:    ; implicit-def: $vgpr55
+; GCN-NEXT:    ; implicit-def: $vgpr54
 ; GCN-NEXT:    ; implicit-def: $vgpr38
+; GCN-NEXT:    ; implicit-def: $vgpr36
 ; GCN-NEXT:    ; implicit-def: $vgpr39
-; GCN-NEXT:    ; implicit-def: $vgpr40
+; GCN-NEXT:    ; implicit-def: $vgpr37
 ; GCN-NEXT:    ; implicit-def: $vgpr48
+; GCN-NEXT:    ; implicit-def: $vgpr11
 ; GCN-NEXT:    ; implicit-def: $vgpr49
-; GCN-NEXT:    ; implicit-def: $vgpr42
-; GCN-NEXT:    ; implicit-def: $vgpr51
+; GCN-NEXT:    ; implicit-def: $vgpr13
 ; GCN-NEXT:    ; implicit-def: $vgpr50
-; GCN-NEXT:    ; implicit-def: $vgpr46
+; GCN-NEXT:    ; implicit-def: $vgpr15
+; GCN-NEXT:    ; implicit-def: $vgpr51
+; GCN-NEXT:    ; implicit-def: $vgpr17
 ; GCN-NEXT:    ; implicit-def: $vgpr52
-; GCN-NEXT:    ; implicit-def: $vgpr54
-; GCN-NEXT:    ; implicit-def: $vgpr41
-; GCN-NEXT:    ; implicit-def: $vgpr43
-; GCN-NEXT:    ; implicit-def: $vgpr44
-; GCN-NEXT:    ; implicit-def: $vgpr45
-; GCN-NEXT:    ; implicit-def: $vgpr47
-; GCN-NEXT:  .LBB25_2: ; %Flow
+; GCN-NEXT:    ; implicit-def: $vgpr19
+; GCN-NEXT:    ; implicit-def: $vgpr27
+; GCN-NEXT:    ; implicit-def: $vgpr21
+; GCN-NEXT:    ; implicit-def: $vgpr29
+; GCN-NEXT:    ; implicit-def: $vgpr23
+; GCN-NEXT:    ; implicit-def: $vgpr53
+; GCN-NEXT:    ; implicit-def: $vgpr25
+; GCN-NEXT:  .LBB13_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB25_4
+; GCN-NEXT:    s_cbranch_execz .LBB13_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v56
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v31
 ; GCN-NEXT:    s_movk_i32 s6, 0x300
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v59
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v32
 ; GCN-NEXT:    s_mov_b32 s7, 0x3000000
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v58
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v57
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v24
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, 3, v26
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, 3, v28
-; GCN-NEXT:    v_add_i32_e32 v15, vcc, 3, v30
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
-; GCN-NEXT:    v_add_i32_e32 v17, vcc, 3, v18
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, 3, v20
-; GCN-NEXT:    v_add_i32_e32 v19, vcc, 3, v22
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, 3, v10
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 3, v12
-; GCN-NEXT:    v_add_i32_e32 v14, vcc, 3, v14
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v33
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v34
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v35
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v10
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v12
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v14
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v16
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v18
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, 3, v20
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, 3, v22
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, 3, v24
+; GCN-NEXT:    v_add_i32_e32 v16, vcc, 3, v26
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 3, v28
+; GCN-NEXT:    v_add_i32_e32 v20, vcc, 3, v30
+; GCN-NEXT:    v_add_i32_e32 v22, vcc, 3, v41
+; GCN-NEXT:    v_add_i32_e32 v24, vcc, 3, v40
+; GCN-NEXT:    v_add_i32_e32 v26, vcc, 3, v55
+; GCN-NEXT:    v_add_i32_e32 v28, vcc, 3, v54
+; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GCN-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GCN-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GCN-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GCN-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GCN-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GCN-NEXT:    v_and_b32_e32 v19, 0xff, v19
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GCN-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xff, v12
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GCN-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GCN-NEXT:    v_or_b32_e32 v1, v47, v1
+; GCN-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GCN-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GCN-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GCN-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GCN-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GCN-NEXT:    v_or_b32_e32 v0, v38, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:    v_or_b32_e32 v2, v39, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT:    v_or_b32_e32 v5, v46, v5
+; GCN-NEXT:    v_or_b32_e32 v4, v48, v4
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_or_b32_e32 v6, v49, v6
 ; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT:    v_or_b32_e32 v9, v45, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT:    v_or_b32_e32 v13, v42, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT:    v_or_b32_e32 v16, v44, v16
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_or_b32_e32 v18, v40, v18
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    v_or_b32_e32 v8, v43, v8
-; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GCN-NEXT:    v_or_b32_e32 v12, v55, v12
-; GCN-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GCN-NEXT:    v_or_b32_e32 v0, v41, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT:    v_or_b32_e32 v4, v53, v4
-; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; GCN-NEXT:    v_or_b32_e32 v3, v54, v3
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, s6, v5
-; GCN-NEXT:    v_or_b32_e32 v7, v52, v7
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, s6, v9
-; GCN-NEXT:    v_or_b32_e32 v11, v50, v11
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, s6, v13
-; GCN-NEXT:    v_or_b32_e32 v15, v51, v15
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, s6, v16
-; GCN-NEXT:    v_or_b32_e32 v17, v49, v17
-; GCN-NEXT:    v_add_i32_e32 v18, vcc, s6, v18
-; GCN-NEXT:    v_or_b32_e32 v19, v48, v19
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, s6, v8
-; GCN-NEXT:    v_or_b32_e32 v10, v39, v10
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, s6, v12
-; GCN-NEXT:    v_or_b32_e32 v14, v38, v14
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v37, v2
+; GCN-NEXT:    v_or_b32_e32 v8, v50, v8
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_or_b32_e32 v10, v51, v10
+; GCN-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT:    v_or_b32_e32 v14, v52, v14
+; GCN-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT:    v_or_b32_e32 v18, v27, v18
+; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT:    v_or_b32_e32 v22, v29, v22
+; GCN-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT:    v_or_b32_e32 v26, v53, v26
+; GCN-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v36, v1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT:    v_or_b32_e32 v3, v37, v3
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, s6, v4
-; GCN-NEXT:    v_or_b32_e32 v6, v36, v6
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GCN-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT:    v_or_b32_e32 v5, v11, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, s6, v6
+; GCN-NEXT:    v_or_b32_e32 v7, v13, v7
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, s6, v8
+; GCN-NEXT:    v_or_b32_e32 v9, v15, v9
+; GCN-NEXT:    v_add_i32_e32 v10, vcc, s6, v10
+; GCN-NEXT:    v_or_b32_e32 v11, v17, v12
+; GCN-NEXT:    v_add_i32_e32 v12, vcc, s6, v14
+; GCN-NEXT:    v_or_b32_e32 v13, v19, v16
+; GCN-NEXT:    v_add_i32_e32 v14, vcc, s6, v18
+; GCN-NEXT:    v_or_b32_e32 v15, v21, v20
+; GCN-NEXT:    v_add_i32_e32 v16, vcc, s6, v22
+; GCN-NEXT:    v_or_b32_e32 v17, v23, v24
+; GCN-NEXT:    v_add_i32_e32 v18, vcc, 0x300, v26
+; GCN-NEXT:    v_or_b32_e32 v19, v25, v27
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_or_b32_e32 v1, v3, v1
-; GCN-NEXT:    v_or_b32_e32 v3, v7, v5
-; GCN-NEXT:    v_or_b32_e32 v5, v11, v9
-; GCN-NEXT:    v_or_b32_e32 v7, v15, v13
-; GCN-NEXT:    v_or_b32_e32 v9, v17, v16
-; GCN-NEXT:    v_or_b32_e32 v11, v19, v18
-; GCN-NEXT:    v_or_b32_e32 v8, v10, v8
-; GCN-NEXT:    v_or_b32_e32 v10, v14, v12
-; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v6, v4
-; GCN-NEXT:    v_add_i32_e32 v34, vcc, s7, v1
-; GCN-NEXT:    v_add_i32_e32 v35, vcc, s7, v3
-; GCN-NEXT:    v_add_i32_e32 v32, vcc, s7, v5
-; GCN-NEXT:    v_add_i32_e32 v33, vcc, s7, v7
-; GCN-NEXT:    v_add_i32_e32 v29, vcc, s7, v9
-; GCN-NEXT:    v_add_i32_e32 v31, vcc, s7, v11
-; GCN-NEXT:    v_add_i32_e32 v25, vcc, s7, v8
-; GCN-NEXT:    v_add_i32_e32 v27, vcc, s7, v10
-; GCN-NEXT:    v_add_i32_e32 v21, vcc, s7, v0
-; GCN-NEXT:    v_add_i32_e32 v23, vcc, s7, v2
-; GCN-NEXT:    v_alignbit_b32 v1, v23, v21, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v27, v25, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v31, v29, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v33, v32, 16
-; GCN-NEXT:    v_alignbit_b32 v17, v35, v34, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v31
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v33
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v35
-; GCN-NEXT:  .LBB25_4: ; %end
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT:    v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v3, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v5, v4
+; GCN-NEXT:    v_or_b32_e32 v3, v7, v6
+; GCN-NEXT:    v_or_b32_e32 v4, v9, v8
+; GCN-NEXT:    v_or_b32_e32 v5, v11, v10
+; GCN-NEXT:    v_or_b32_e32 v6, v13, v12
+; GCN-NEXT:    v_or_b32_e32 v7, v15, v14
+; GCN-NEXT:    v_or_b32_e32 v8, v17, v16
+; GCN-NEXT:    v_or_b32_e32 v9, v19, v18
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s7, v0
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, s7, v1
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, s7, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, s7, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, s7, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, s7, v6
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, 0x3000000, v8
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v9
+; GCN-NEXT:  .LBB13_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, v21
-; GCN-NEXT:    v_mov_b32_e32 v2, v23
-; GCN-NEXT:    v_mov_b32_e32 v4, v25
-; GCN-NEXT:    v_mov_b32_e32 v6, v27
-; GCN-NEXT:    v_mov_b32_e32 v8, v29
-; GCN-NEXT:    v_mov_b32_e32 v10, v31
-; GCN-NEXT:    v_mov_b32_e32 v12, v32
-; GCN-NEXT:    v_mov_b32_e32 v14, v33
-; GCN-NEXT:    v_mov_b32_e32 v16, v34
-; GCN-NEXT:    v_mov_b32_e32 v18, v35
-; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v40i8_to_v20i16:
+; VI-LABEL: bitcast_v40i8_to_v10f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT:    v_mov_b32_e32 v34, v10
-; VI-NEXT:    v_mov_b32_e32 v33, v8
-; VI-NEXT:    v_mov_b32_e32 v35, v6
-; VI-NEXT:    v_mov_b32_e32 v38, v4
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT:    v_mov_b32_e32 v35, v8
+; VI-NEXT:    v_mov_b32_e32 v34, v6
+; VI-NEXT:    v_mov_b32_e32 v33, v4
 ; VI-NEXT:    v_mov_b32_e32 v32, v2
-; VI-NEXT:    v_mov_b32_e32 v36, v0
+; VI-NEXT:    v_mov_b32_e32 v31, v0
 ; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:16
 ; VI-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_ushort v4, off, s[0:3], s32
 ; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36
 ; VI-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:32
-; VI-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:28
-; VI-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:24
-; VI-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_ushort v51, off, s[0:3], s32 offset:4
-; VI-NEXT:    v_mov_b32_e32 v31, v14
-; VI-NEXT:    v_mov_b32_e32 v37, v12
-; VI-NEXT:    v_lshlrev_b16_e32 v39, 8, v1
-; VI-NEXT:    v_lshlrev_b16_e32 v48, 8, v3
-; VI-NEXT:    v_lshlrev_b16_e32 v49, 8, v5
-; VI-NEXT:    v_lshlrev_b16_e32 v50, 8, v7
-; VI-NEXT:    v_lshlrev_b16_e32 v52, 8, v9
-; VI-NEXT:    v_lshlrev_b16_e32 v40, 8, v11
-; VI-NEXT:    v_lshlrev_b16_e32 v41, 8, v13
-; VI-NEXT:    v_lshlrev_b16_e32 v42, 8, v15
-; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v17
-; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v19
-; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v23
+; VI-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v1
+; VI-NEXT:    v_lshlrev_b16_e32 v42, 8, v3
+; VI-NEXT:    v_lshlrev_b16_e32 v41, 8, v5
+; VI-NEXT:    v_lshlrev_b16_e32 v40, 8, v7
+; VI-NEXT:    v_lshlrev_b16_e32 v55, 8, v9
+; VI-NEXT:    v_lshlrev_b16_e32 v54, 8, v11
+; VI-NEXT:    v_lshlrev_b16_e32 v53, 8, v13
+; VI-NEXT:    v_lshlrev_b16_e32 v52, 8, v15
+; VI-NEXT:    v_lshlrev_b16_e32 v51, 8, v17
+; VI-NEXT:    v_lshlrev_b16_e32 v50, 8, v19
+; VI-NEXT:    v_lshlrev_b16_e32 v49, 8, v21
+; VI-NEXT:    v_lshlrev_b16_e32 v48, 8, v23
 ; VI-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
-; VI-NEXT:    v_lshlrev_b16_e32 v27, 8, v27
-; VI-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; VI-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
+; VI-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
 ; VI-NEXT:    s_waitcnt vmcnt(9)
-; VI-NEXT:    v_lshlrev_b16_e32 v45, 8, v0
+; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(8)
-; VI-NEXT:    v_lshlrev_b16_e32 v44, 8, v2
+; VI-NEXT:    v_lshlrev_b16_e32 v17, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(7)
-; VI-NEXT:    v_lshlrev_b16_e32 v43, 8, v4
+; VI-NEXT:    v_lshlrev_b16_e32 v19, 8, v4
 ; VI-NEXT:    s_waitcnt vmcnt(6)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; VI-NEXT:    s_waitcnt vmcnt(5)
-; VI-NEXT:    v_lshlrev_b16_e32 v47, 8, v8
+; VI-NEXT:    v_lshlrev_b16_e32 v11, 8, v8
+; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_lshlrev_b16_e32 v46, 8, v10
-; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; VI-NEXT:    v_lshlrev_b16_e32 v13, 8, v44
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB25_2
+; VI-NEXT:    s_cbranch_execz .LBB13_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
-; VI-NEXT:    v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v1, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v35, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v2, v33, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v3, v37, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v31, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v9, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v10, v55, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr31
 ; VI-NEXT:    ; implicit-def: $vgpr32
-; VI-NEXT:    ; implicit-def: $vgpr38
-; VI-NEXT:    ; implicit-def: $vgpr35
 ; VI-NEXT:    ; implicit-def: $vgpr33
 ; VI-NEXT:    ; implicit-def: $vgpr34
-; VI-NEXT:    ; implicit-def: $vgpr37
-; VI-NEXT:    ; implicit-def: $vgpr31
+; VI-NEXT:    ; implicit-def: $vgpr35
+; VI-NEXT:    ; implicit-def: $vgpr10
+; VI-NEXT:    ; implicit-def: $vgpr12
+; VI-NEXT:    ; implicit-def: $vgpr14
 ; VI-NEXT:    ; implicit-def: $vgpr16
 ; VI-NEXT:    ; implicit-def: $vgpr18
 ; VI-NEXT:    ; implicit-def: $vgpr20
@@ -8356,222 +5294,210 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr26
 ; VI-NEXT:    ; implicit-def: $vgpr28
 ; VI-NEXT:    ; implicit-def: $vgpr30
-; VI-NEXT:    ; implicit-def: $vgpr51
-; VI-NEXT:    ; implicit-def: $vgpr53
-; VI-NEXT:    ; implicit-def: $vgpr54
-; VI-NEXT:    ; implicit-def: $vgpr55
 ; VI-NEXT:    ; implicit-def: $vgpr39
-; VI-NEXT:    ; implicit-def: $vgpr48
-; VI-NEXT:    ; implicit-def: $vgpr49
-; VI-NEXT:    ; implicit-def: $vgpr50
-; VI-NEXT:    ; implicit-def: $vgpr52
-; VI-NEXT:    ; implicit-def: $vgpr40
-; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr38
+; VI-NEXT:    ; implicit-def: $vgpr37
+; VI-NEXT:    ; implicit-def: $vgpr36
+; VI-NEXT:    ; implicit-def: $vgpr43
 ; VI-NEXT:    ; implicit-def: $vgpr42
-; VI-NEXT:    ; implicit-def: $vgpr17
-; VI-NEXT:    ; implicit-def: $vgpr19
-; VI-NEXT:    ; implicit-def: $vgpr21
-; VI-NEXT:    ; implicit-def: $vgpr23
+; VI-NEXT:    ; implicit-def: $vgpr41
+; VI-NEXT:    ; implicit-def: $vgpr40
+; VI-NEXT:    ; implicit-def: $vgpr55
+; VI-NEXT:    ; implicit-def: $vgpr54
+; VI-NEXT:    ; implicit-def: $vgpr53
+; VI-NEXT:    ; implicit-def: $vgpr52
+; VI-NEXT:    ; implicit-def: $vgpr51
+; VI-NEXT:    ; implicit-def: $vgpr50
+; VI-NEXT:    ; implicit-def: $vgpr49
+; VI-NEXT:    ; implicit-def: $vgpr48
 ; VI-NEXT:    ; implicit-def: $vgpr25
-; VI-NEXT:    ; implicit-def: $vgpr27
-; VI-NEXT:    ; implicit-def: $vgpr29
-; VI-NEXT:    ; implicit-def: $vgpr43
-; VI-NEXT:    ; implicit-def: $vgpr44
-; VI-NEXT:    ; implicit-def: $vgpr45
-; VI-NEXT:    ; implicit-def: $vgpr46
-; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:  .LBB25_2: ; %Flow
+; VI-NEXT:    ; implicit-def: $vgpr23
+; VI-NEXT:    ; implicit-def: $vgpr21
+; VI-NEXT:    ; implicit-def: $vgpr19
+; VI-NEXT:    ; implicit-def: $vgpr17
+; VI-NEXT:    ; implicit-def: $vgpr15
+; VI-NEXT:    ; implicit-def: $vgpr13
+; VI-NEXT:    ; implicit-def: $vgpr11
+; VI-NEXT:  .LBB13_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB25_4
+; VI-NEXT:    s_cbranch_execz .LBB13_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
-; VI-NEXT:    v_add_u16_e32 v0, 3, v55
-; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v1, 0x300
-; VI-NEXT:    v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_add_u16_e32 v0, 3, v54
-; VI-NEXT:    v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_add_u16_e32 v0, 3, v53
-; VI-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u16_e32 v0, 3, v51
-; VI-NEXT:    v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, 3, v30
-; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v7, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v0, 3, v28
-; VI-NEXT:    v_or_b32_sdwa v12, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, 3, v26
-; VI-NEXT:    v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v6, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v0, 3, v24
-; VI-NEXT:    v_or_b32_sdwa v13, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, 3, v22
-; VI-NEXT:    v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v0, 3, v20
-; VI-NEXT:    v_or_b32_sdwa v14, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, 3, v18
-; VI-NEXT:    v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v0, 3, v16
-; VI-NEXT:    v_or_b32_sdwa v15, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v0, 3, v31
-; VI-NEXT:    v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v0, 3, v37
-; VI-NEXT:    v_or_b32_sdwa v16, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, 3, v34
-; VI-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v0, 3, v33
-; VI-NEXT:    v_or_b32_sdwa v17, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, 3, v35
-; VI-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v18, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v0, 3, v38
-; VI-NEXT:    v_or_b32_sdwa v19, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_e32 v0, 3, v32
-; VI-NEXT:    v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_add_u16_e32 v1, 3, v36
-; VI-NEXT:    v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v1, 3, v32
+; VI-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_mov_b32_e32 v9, 0x300
+; VI-NEXT:    v_add_u16_e32 v0, 0x300, v0
+; VI-NEXT:    v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_add_u16_e32 v1, 3, v33
+; VI-NEXT:    v_add_u16_e32 v2, 3, v34
+; VI-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v1, 0x300, v1
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_add_u16_e32 v1, 0x300, v19
-; VI-NEXT:    v_add_u16_e32 v17, 0x300, v17
-; VI-NEXT:    v_add_u16_e32 v16, 0x300, v16
-; VI-NEXT:    v_add_u16_e32 v15, 0x300, v15
-; VI-NEXT:    v_add_u16_e32 v14, 0x300, v14
-; VI-NEXT:    v_add_u16_e32 v13, 0x300, v13
-; VI-NEXT:    v_add_u16_e32 v12, 0x300, v12
-; VI-NEXT:    v_add_u16_e32 v11, 0x300, v11
+; VI-NEXT:    v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_add_u16_e32 v2, 3, v35
+; VI-NEXT:    v_add_u16_e32 v3, 3, v10
+; VI-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v2, 0x300, v2
+; VI-NEXT:    v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v2, v2, v3
+; VI-NEXT:    v_add_u16_e32 v3, 3, v12
+; VI-NEXT:    v_add_u16_e32 v4, 3, v14
+; VI-NEXT:    v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v3, 0x300, v3
+; VI-NEXT:    v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v3, v3, v4
+; VI-NEXT:    v_add_u16_e32 v4, 3, v16
+; VI-NEXT:    v_add_u16_e32 v5, 3, v18
+; VI-NEXT:    v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v4, 0x300, v4
+; VI-NEXT:    v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_add_u16_e32 v5, 3, v20
+; VI-NEXT:    v_add_u16_e32 v6, 3, v22
+; VI-NEXT:    v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v5, 0x300, v5
+; VI-NEXT:    v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v5, v5, v6
+; VI-NEXT:    v_add_u16_e32 v6, 3, v24
+; VI-NEXT:    v_add_u16_e32 v7, 3, v26
+; VI-NEXT:    v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v6, 0x300, v6
+; VI-NEXT:    v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v6, v6, v7
+; VI-NEXT:    v_add_u16_e32 v7, 3, v28
+; VI-NEXT:    v_add_u16_e32 v8, 3, v30
+; VI-NEXT:    v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v7, 0x300, v7
+; VI-NEXT:    v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v7, v7, v8
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_add_u16_e32 v8, 3, v39
+; VI-NEXT:    v_add_u16_e32 v10, 3, v38
+; VI-NEXT:    v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_add_u16_e32 v8, 0x300, v8
+; VI-NEXT:    v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v8, v8, v10
+; VI-NEXT:    v_add_u16_e32 v10, 3, v37
+; VI-NEXT:    v_add_u16_e32 v12, 3, v36
+; VI-NEXT:    v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT:    v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v10, 0x300, v10
-; VI-NEXT:    v_or_b32_e32 v1, v1, v18
-; VI-NEXT:    v_or_b32_e32 v2, v17, v2
-; VI-NEXT:    v_or_b32_e32 v3, v16, v3
-; VI-NEXT:    v_or_b32_e32 v4, v15, v4
-; VI-NEXT:    v_or_b32_e32 v5, v14, v5
-; VI-NEXT:    v_or_b32_e32 v6, v13, v6
-; VI-NEXT:    v_or_b32_e32 v7, v12, v7
-; VI-NEXT:    v_or_b32_e32 v8, v11, v8
+; VI-NEXT:    v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:  .LBB25_4: ; %end
+; VI-NEXT:  .LBB13_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v40i8_to_v20i16:
+; GFX9-LABEL: bitcast_v40i8_to_v10f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_mov_b32_e32 v31, v10
-; GFX9-NEXT:    v_mov_b32_e32 v32, v8
-; GFX9-NEXT:    v_mov_b32_e32 v38, v6
-; GFX9-NEXT:    v_mov_b32_e32 v35, v4
-; GFX9-NEXT:    v_mov_b32_e32 v33, v2
-; GFX9-NEXT:    v_mov_b32_e32 v36, v0
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_mov_b32_e32 v35, v8
+; GFX9-NEXT:    v_mov_b32_e32 v34, v6
+; GFX9-NEXT:    v_mov_b32_e32 v33, v4
+; GFX9-NEXT:    v_mov_b32_e32 v32, v2
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], s32 offset:16
 ; GFX9-NEXT:    buffer_load_ushort v2, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_ushort v4, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36
-; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:32
-; GFX9-NEXT:    buffer_load_ushort v54, off, s[0:3], s32 offset:28
-; GFX9-NEXT:    buffer_load_ushort v10, off, s[0:3], s32 offset:24
-; GFX9-NEXT:    buffer_load_ushort v42, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_ushort v53, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_ushort v55, off, s[0:3], s32 offset:4
-; GFX9-NEXT:    v_mov_b32_e32 v37, v14
-; GFX9-NEXT:    v_mov_b32_e32 v34, v12
-; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v39, 8, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v50, 8, v5
-; GFX9-NEXT:    v_lshlrev_b16_e32 v49, 8, v7
-; GFX9-NEXT:    v_lshlrev_b16_e32 v52, 8, v9
-; GFX9-NEXT:    v_lshlrev_b16_e32 v51, 8, v11
-; GFX9-NEXT:    v_lshlrev_b16_e32 v41, 8, v13
-; GFX9-NEXT:    v_lshlrev_b16_e32 v40, 8, v15
-; GFX9-NEXT:    v_lshlrev_b16_e32 v43, 8, v17
-; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v19
-; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v21
-; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v23
+; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ushort v36, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_ushort v44, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ushort v37, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ushort v38, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v39, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v43, 8, v1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v42, 8, v3
+; GFX9-NEXT:    v_lshlrev_b16_e32 v41, 8, v5
+; GFX9-NEXT:    v_lshlrev_b16_e32 v40, 8, v7
+; GFX9-NEXT:    v_lshlrev_b16_e32 v55, 8, v9
+; GFX9-NEXT:    v_lshlrev_b16_e32 v54, 8, v11
+; GFX9-NEXT:    v_lshlrev_b16_e32 v53, 8, v13
+; GFX9-NEXT:    v_lshlrev_b16_e32 v52, 8, v15
+; GFX9-NEXT:    v_lshlrev_b16_e32 v51, 8, v17
+; GFX9-NEXT:    v_lshlrev_b16_e32 v50, 8, v19
+; GFX9-NEXT:    v_lshlrev_b16_e32 v49, 8, v21
+; GFX9-NEXT:    v_lshlrev_b16_e32 v48, 8, v23
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v25, 8, v25
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v23, 8, v27
-; GFX9-NEXT:    v_lshlrev_b16_e32 v29, 8, v29
+; GFX9-NEXT:    v_lshlrev_b16_e32 v21, 8, v29
 ; GFX9-NEXT:    s_waitcnt vmcnt(9)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v44, 8, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v0
 ; GFX9-NEXT:    s_waitcnt vmcnt(8)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v45, 8, v2
+; GFX9-NEXT:    v_lshlrev_b16_e32 v17, 8, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(7)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v27, 8, v4
+; GFX9-NEXT:    v_lshlrev_b16_e32 v19, 8, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(6)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v46, 8, v8
+; GFX9-NEXT:    v_lshlrev_b16_e32 v11, 8, v8
+; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
 ; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v47, 8, v10
-; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 8, v44
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB25_2
+; GFX9-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
-; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s6
-; GFX9-NEXT:    v_or_b32_sdwa v1, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s6
-; GFX9-NEXT:    v_or_b32_sdwa v2, v32, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v3, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v2, v3, v2, s6
-; GFX9-NEXT:    v_or_b32_sdwa v3, v34, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v4, v37, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v3, v4, v3, s6
-; GFX9-NEXT:    v_or_b32_sdwa v4, v16, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v5, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s6
-; GFX9-NEXT:    v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s6
+; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s6
-; GFX9-NEXT:    v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s6
+; GFX9-NEXT:    v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_or_b32_sdwa v8, v55, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v9, v53, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s6
-; GFX9-NEXT:    v_or_b32_sdwa v9, v42, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v10, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_perm_b32 v9, v10, v9, s6
-; GFX9-NEXT:    ; implicit-def: $vgpr36
-; GFX9-NEXT:    ; implicit-def: $vgpr33
-; GFX9-NEXT:    ; implicit-def: $vgpr35
-; GFX9-NEXT:    ; implicit-def: $vgpr38
-; GFX9-NEXT:    ; implicit-def: $vgpr32
+; GFX9-NEXT:    v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-NEXT:    ; implicit-def: $vgpr31
+; GFX9-NEXT:    ; implicit-def: $vgpr32
+; GFX9-NEXT:    ; implicit-def: $vgpr33
 ; GFX9-NEXT:    ; implicit-def: $vgpr34
-; GFX9-NEXT:    ; implicit-def: $vgpr37
+; GFX9-NEXT:    ; implicit-def: $vgpr35
+; GFX9-NEXT:    ; implicit-def: $vgpr10
+; GFX9-NEXT:    ; implicit-def: $vgpr12
+; GFX9-NEXT:    ; implicit-def: $vgpr14
 ; GFX9-NEXT:    ; implicit-def: $vgpr16
 ; GFX9-NEXT:    ; implicit-def: $vgpr18
 ; GFX9-NEXT:    ; implicit-def: $vgpr20
@@ -8580,575 +5506,782 @@ define <20 x i16> @bitcast_v40i8_to_v20i16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr26
 ; GFX9-NEXT:    ; implicit-def: $vgpr28
 ; GFX9-NEXT:    ; implicit-def: $vgpr30
-; GFX9-NEXT:    ; implicit-def: $vgpr55
-; GFX9-NEXT:    ; implicit-def: $vgpr53
+; GFX9-NEXT:    ; implicit-def: $vgpr39
+; GFX9-NEXT:    ; implicit-def: $vgpr38
+; GFX9-NEXT:    ; implicit-def: $vgpr37
+; GFX9-NEXT:    ; implicit-def: $vgpr36
+; GFX9-NEXT:    ; implicit-def: $vgpr43
 ; GFX9-NEXT:    ; implicit-def: $vgpr42
+; GFX9-NEXT:    ; implicit-def: $vgpr41
+; GFX9-NEXT:    ; implicit-def: $vgpr40
+; GFX9-NEXT:    ; implicit-def: $vgpr55
 ; GFX9-NEXT:    ; implicit-def: $vgpr54
-; GFX9-NEXT:    ; implicit-def: $vgpr48
-; GFX9-NEXT:    ; implicit-def: $vgpr39
-; GFX9-NEXT:    ; implicit-def: $vgpr50
-; GFX9-NEXT:    ; implicit-def: $vgpr49
+; GFX9-NEXT:    ; implicit-def: $vgpr53
 ; GFX9-NEXT:    ; implicit-def: $vgpr52
 ; GFX9-NEXT:    ; implicit-def: $vgpr51
-; GFX9-NEXT:    ; implicit-def: $vgpr41
-; GFX9-NEXT:    ; implicit-def: $vgpr40
-; GFX9-NEXT:    ; implicit-def: $vgpr43
-; GFX9-NEXT:    ; implicit-def: $vgpr17
-; GFX9-NEXT:    ; implicit-def: $vgpr21
-; GFX9-NEXT:    ; implicit-def: $vgpr19
+; GFX9-NEXT:    ; implicit-def: $vgpr50
+; GFX9-NEXT:    ; implicit-def: $vgpr49
+; GFX9-NEXT:    ; implicit-def: $vgpr48
 ; GFX9-NEXT:    ; implicit-def: $vgpr25
 ; GFX9-NEXT:    ; implicit-def: $vgpr23
-; GFX9-NEXT:    ; implicit-def: $vgpr29
-; GFX9-NEXT:    ; implicit-def: $vgpr27
-; GFX9-NEXT:    ; implicit-def: $vgpr45
-; GFX9-NEXT:    ; implicit-def: $vgpr44
-; GFX9-NEXT:    ; implicit-def: $vgpr47
-; GFX9-NEXT:    ; implicit-def: $vgpr46
-; GFX9-NEXT:  .LBB25_2: ; %Flow
+; GFX9-NEXT:    ; implicit-def: $vgpr21
+; GFX9-NEXT:    ; implicit-def: $vgpr19
+; GFX9-NEXT:    ; implicit-def: $vgpr17
+; GFX9-NEXT:    ; implicit-def: $vgpr15
+; GFX9-NEXT:    ; implicit-def: $vgpr13
+; GFX9-NEXT:    ; implicit-def: $vgpr11
+; GFX9-NEXT:  .LBB13_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB25_4
+; GFX9-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
-; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v42
-; GFX9-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v9, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v54
-; GFX9-NEXT:    v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v10, 0x300, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v55
-; GFX9-NEXT:    v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v8, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v53
-; GFX9-NEXT:    v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v11, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v28
-; GFX9-NEXT:    v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v7, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v30
-; GFX9-NEXT:    v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v12, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v24
-; GFX9-NEXT:    v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v6, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v26
-; GFX9-NEXT:    v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v13, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v20
-; GFX9-NEXT:    v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v5, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v22
-; GFX9-NEXT:    v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v14, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v16
-; GFX9-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v4, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v18
-; GFX9-NEXT:    v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v15, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v34
-; GFX9-NEXT:    v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v3, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v37
-; GFX9-NEXT:    v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v16, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v32
-; GFX9-NEXT:    v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v2, 0x300, v0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 3, v31
-; GFX9-NEXT:    v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v17, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v35
-; GFX9-NEXT:    v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v1, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v38
-; GFX9-NEXT:    v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_add_u16_e32 v18, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, 3, v36
-; GFX9-NEXT:    v_add_u16_e32 v19, 3, v33
-; GFX9-NEXT:    v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_sdwa v19, v39, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v1, 3, v32
+; GFX9-NEXT:    v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    s_movk_i32 s6, 0x300
+; GFX9-NEXT:    v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_u16_e32 v0, 0x300, v0
-; GFX9-NEXT:    v_add_u16_e32 v19, 0x300, v19
-; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v19, v0, s6
-; GFX9-NEXT:    v_perm_b32 v1, v18, v1, s6
-; GFX9-NEXT:    v_perm_b32 v2, v17, v2, s6
-; GFX9-NEXT:    v_perm_b32 v3, v16, v3, s6
-; GFX9-NEXT:    v_perm_b32 v4, v15, v4, s6
-; GFX9-NEXT:    v_perm_b32 v5, v14, v5, s6
-; GFX9-NEXT:    v_perm_b32 v6, v13, v6, s6
-; GFX9-NEXT:    v_perm_b32 v7, v12, v7, s6
-; GFX9-NEXT:    v_perm_b32 v8, v11, v8, s6
-; GFX9-NEXT:    v_perm_b32 v9, v10, v9, s6
-; GFX9-NEXT:  .LBB25_4: ; %end
+; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_add_u16_e32 v1, 3, v33
+; GFX9-NEXT:    v_add_u16_e32 v2, 3, v34
+; GFX9-NEXT:    v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v1, 0x300, v1
+; GFX9-NEXT:    v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT:    v_add_u16_e32 v2, 3, v35
+; GFX9-NEXT:    v_add_u16_e32 v3, 3, v10
+; GFX9-NEXT:    v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v2, 0x300, v2
+; GFX9-NEXT:    v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX9-NEXT:    v_add_u16_e32 v3, 3, v12
+; GFX9-NEXT:    v_add_u16_e32 v4, 3, v14
+; GFX9-NEXT:    v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v3, 0x300, v3
+; GFX9-NEXT:    v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT:    v_add_u16_e32 v4, 3, v16
+; GFX9-NEXT:    v_add_u16_e32 v5, 3, v18
+; GFX9-NEXT:    v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v4, 0x300, v4
+; GFX9-NEXT:    v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX9-NEXT:    v_add_u16_e32 v5, 3, v20
+; GFX9-NEXT:    v_add_u16_e32 v6, 3, v22
+; GFX9-NEXT:    v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v5, 0x300, v5
+; GFX9-NEXT:    v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX9-NEXT:    v_add_u16_e32 v6, 3, v24
+; GFX9-NEXT:    v_add_u16_e32 v7, 3, v26
+; GFX9-NEXT:    v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v6, 0x300, v6
+; GFX9-NEXT:    v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT:    v_add_u16_e32 v7, 3, v28
+; GFX9-NEXT:    v_add_u16_e32 v8, 3, v30
+; GFX9-NEXT:    v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v7, 0x300, v7
+; GFX9-NEXT:    v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u16_e32 v8, 3, v39
+; GFX9-NEXT:    v_add_u16_e32 v9, 3, v38
+; GFX9-NEXT:    v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v8, 0x300, v8
+; GFX9-NEXT:    v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX9-NEXT:    v_add_u16_e32 v9, 3, v37
+; GFX9-NEXT:    v_add_u16_e32 v10, 3, v36
+; GFX9-NEXT:    v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_add_u16_e32 v9, 0x300, v9
+; GFX9-NEXT:    v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX9-NEXT:  .LBB13_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v40i8_to_v20i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v10 :: v_dual_mov_b32 v34, v8
-; GFX11-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
-; GFX11-NEXT:    s_clause 0x9
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:4
-; GFX11-NEXT:    v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b16 v70, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_4
-; GFX11-NEXT:  .LBB25_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB25_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v48
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v49
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v51
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v50
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v52
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v54
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v17
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v66
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v25
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v69
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v29
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v70
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v71
-; GFX11-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v14, v13, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB25_2
-; GFX11-NEXT:  .LBB25_4: ; %cmp.true
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v0, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v66, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_nc_u16 v3, v65, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v28, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v2, v67, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-NEXT:    v_or_b32_e32 v3, v29, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v27, v2
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v69, v4
-; GFX11-NEXT:    v_add_nc_u16 v1, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v26, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, v24, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v23, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_perm_b32 v8, v11, v8, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v25, v0
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v21, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v64, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v19, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v31, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v54, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v53, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v17, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v52, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v33, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v55, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-NEXT:    v_or_b32_e32 v16, v48, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v49, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v50, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v0
-; GFX11-NEXT:    v_perm_b32 v0, v17, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v18, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v19, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v21, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v14, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v13, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v12, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v10, v9, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v10f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x9
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v27.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v36
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-TRUE16-NEXT:  .LBB13_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v24.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v17, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-TRUE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v26.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v25.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v25.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v23.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v22.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v23.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v24.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v15.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v15.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v17.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v16.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v17.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v18.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v13.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v14.h, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v13.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v14.l, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v17
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v10.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v10.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v11.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v11.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v12.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v17, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v10f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x9
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v36, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v37, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v38, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-FAKE16-NEXT:  .LBB13_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-FAKE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v10, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v12, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v18, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v53, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v54, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v55, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v64, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v65, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v48, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v49, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v50, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v51, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v52, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v39, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v36, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v21, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v23, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v25, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v27, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v29, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v13, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v15, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v17, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v19, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
 cmp.true:
   %a1 = add <40 x i8> %a, splat (i8 3)
-  %a2 = bitcast <40 x i8> %a1 to <20 x i16>
+  %a2 = bitcast <40 x i8> %a1 to <10 x float>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <40 x i8> %a to <20 x i16>
+  %a3 = bitcast <40 x i8> %a to <10 x float>
   br label %end
 
 end:
-  %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x i16> %phi
+  %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <10 x float> %phi
 }
 
-define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20i16_to_v5f64:
+define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v10f32_to_v5f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v28, v14
-; GCN-NEXT:    v_mov_b32_e32 v27, v12
-; GCN-NEXT:    v_mov_b32_e32 v26, v10
-; GCN-NEXT:    v_mov_b32_e32 v21, v8
-; GCN-NEXT:    v_mov_b32_e32 v22, v6
-; GCN-NEXT:    v_mov_b32_e32 v23, v4
-; GCN-NEXT:    v_mov_b32_e32 v24, v2
-; GCN-NEXT:    v_mov_b32_e32 v25, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GCN-NEXT:    v_lshlrev_b32_e32 v34, 16, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v35, 16, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v31, 16, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB26_3
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB26_4
-; GCN-NEXT:  .LBB26_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB26_3: ; %cmp.false
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v25
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v24
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v34
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v35
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v23
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v22
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v21
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v26
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v27
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v28
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v16
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v18
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v20
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v29
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v30
-; GCN-NEXT:    v_or_b32_e32 v5, v5, v31
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v32
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v33
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v17
-; GCN-NEXT:    v_or_b32_e32 v9, v9, v19
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr34
-; GCN-NEXT:    ; implicit-def: $vgpr35
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr33
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB26_2
-; GCN-NEXT:  .LBB26_4: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v25
-; GCN-NEXT:    s_mov_b32 s6, 0x30000
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v24
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v23
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v22
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v21
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v26
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v27
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v28
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v16
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v18
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GCN-NEXT:    v_or_b32_e32 v0, v34, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v35, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v20, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v29, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v30, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v31, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v32, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v33, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v17, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v19, v9
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x30000, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, s6, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, s6, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, s6, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, s6, v7
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, s6, v8
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v9
+; GCN-NEXT:    s_cbranch_execz .LBB14_2
+; GCN-NEXT:  ; %bb.1: ; %cmp.true
+; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:  .LBB14_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v20i16_to_v5f64:
+; VI-LABEL: bitcast_v10f32_to_v5f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB26_2
+; VI-NEXT:    s_cbranch_execz .LBB14_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v11, 3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v9
-; VI-NEXT:    v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:    v_add_u16_e32 v10, 3, v8
-; VI-NEXT:    v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v8, v10, v8
-; VI-NEXT:    v_add_u16_e32 v10, 3, v7
-; VI-NEXT:    v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v7, v10, v7
-; VI-NEXT:    v_add_u16_e32 v10, 3, v6
-; VI-NEXT:    v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v6, v10, v6
-; VI-NEXT:    v_add_u16_e32 v10, 3, v5
-; VI-NEXT:    v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v5, v10, v5
-; VI-NEXT:    v_add_u16_e32 v10, 3, v4
-; VI-NEXT:    v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_add_u16_e32 v10, 3, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v3, v10, v3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v2
-; VI-NEXT:    v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v10, v2
-; VI-NEXT:    v_add_u16_e32 v10, 3, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v10, v1
-; VI-NEXT:    v_add_u16_e32 v10, 3, v0
-; VI-NEXT:    v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v10, v0
-; VI-NEXT:  .LBB26_2: ; %end
+; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; VI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; VI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT:  .LBB14_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v20i16_to_v5f64:
+; GFX9-LABEL: bitcast_v10f32_to_v5f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB26_2
+; GFX9-NEXT:    s_cbranch_execz .LBB14_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB26_2: ; %end
+; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT:  .LBB14_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v20i16_to_v5f64:
+; GFX11-LABEL: bitcast_v10f32_to_v5f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
@@ -9156,31 +6289,25 @@ define <5 x double> @bitcast_v20i16_to_v5f64(<20 x i16> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:  .LBB26_2: ; %end
+; GFX11-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-NEXT:    v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-NEXT:    v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-NEXT:    v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-NEXT:    v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT:  ; %bb.2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
 cmp.true:
-  %a1 = add <20 x i16> %a, splat (i16 3)
-  %a2 = bitcast <20 x i16> %a1 to <5 x double>
+  %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
+  %a2 = bitcast <10 x float> %a1 to <5 x double>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <20 x i16> %a to <5 x double>
+  %a3 = bitcast <10 x float> %a to <5 x double>
   br label %end
 
 end:
@@ -9188,113 +6315,62 @@ end:
   ret <5 x double> %phi
 }
 
-define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) {
-; GCN-LABEL: bitcast_v5f64_to_v20i16:
+define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v5f64_to_v10f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v28, v9
-; GCN-NEXT:    v_mov_b32_e32 v27, v8
-; GCN-NEXT:    v_mov_b32_e32 v26, v7
-; GCN-NEXT:    v_mov_b32_e32 v25, v6
-; GCN-NEXT:    v_mov_b32_e32 v24, v5
-; GCN-NEXT:    v_mov_b32_e32 v23, v4
-; GCN-NEXT:    v_mov_b32_e32 v22, v3
-; GCN-NEXT:    v_mov_b32_e32 v21, v2
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr5
-; GCN-NEXT:    ; implicit-def: $vgpr7
-; GCN-NEXT:    ; implicit-def: $vgpr9
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB27_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_alignbit_b32 v17, v28, v27, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v26, v25, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v24, v23, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v22, v21, 16
-; GCN-NEXT:    v_alignbit_b32 v20, v1, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v26
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v24
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v22
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NEXT:  .LBB27_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB27_4
-; GCN-NEXT:  ; %bb.3: ; %cmp.true
+; GCN-NEXT:    s_cbranch_execz .LBB15_2
+; GCN-NEXT:  ; %bb.1: ; %cmp.true
+; GCN-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
+; GCN-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
+; GCN-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
+; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GCN-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; GCN-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; GCN-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
-; GCN-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
-; GCN-NEXT:    v_alignbit_b32 v17, v28, v27, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v26, v25, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v24, v23, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v22, v21, 16
-; GCN-NEXT:    v_alignbit_b32 v20, v1, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v28
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v26
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v24
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v22
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NEXT:  .LBB27_4: ; %end
+; GCN-NEXT:  .LBB15_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_mov_b32_e32 v4, v21
-; GCN-NEXT:    v_mov_b32_e32 v6, v22
-; GCN-NEXT:    v_mov_b32_e32 v8, v23
-; GCN-NEXT:    v_mov_b32_e32 v10, v24
-; GCN-NEXT:    v_mov_b32_e32 v12, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v16, v27
-; GCN-NEXT:    v_mov_b32_e32 v18, v28
-; GCN-NEXT:    v_mov_b32_e32 v1, v20
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v5f64_to_v20i16:
+; VI-LABEL: bitcast_v5f64_to_v10f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB27_2
+; VI-NEXT:    s_cbranch_execz .LBB15_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; VI-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; VI-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; VI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT:  .LBB27_2: ; %end
+; VI-NEXT:  .LBB15_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v5f64_to_v20i16:
+; GFX9-LABEL: bitcast_v5f64_to_v10f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB27_2
+; GFX9-NEXT:    s_cbranch_execz .LBB15_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; GFX9-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:  .LBB27_2: ; %end
+; GFX9-NEXT:  .LBB15_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v5f64_to_v20i16:
+; GFX11-LABEL: bitcast_v5f64_to_v10f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
@@ -9302,14 +6378,14 @@ define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB27_2
+; GFX11-NEXT:    s_cbranch_execz .LBB15_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; GFX11-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX11-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT:  .LBB27_2: ; %end
+; GFX11-NEXT:  .LBB15_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -9317,207 +6393,89 @@ define <20 x i16> @bitcast_v5f64_to_v20i16(<5 x double> %a, i32 %b) {
 
 cmp.true:
   %a1 = fadd <5 x double> %a, splat (double 1.000000e+00)
-  %a2 = bitcast <5 x double> %a1 to <20 x i16>
+  %a2 = bitcast <5 x double> %a1 to <10 x float>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <5 x double> %a to <20 x i16>
+  %a3 = bitcast <5 x double> %a to <10 x float>
   br label %end
 
 end:
-  %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x i16> %phi
+  %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <10 x float> %phi
 }
 
-define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) {
-; GCN-LABEL: bitcast_v20i16_to_v5i64:
+define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v10f32_to_v5i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v28, v14
-; GCN-NEXT:    v_mov_b32_e32 v27, v12
-; GCN-NEXT:    v_mov_b32_e32 v26, v10
-; GCN-NEXT:    v_mov_b32_e32 v21, v8
-; GCN-NEXT:    v_mov_b32_e32 v22, v6
-; GCN-NEXT:    v_mov_b32_e32 v23, v4
-; GCN-NEXT:    v_mov_b32_e32 v24, v2
-; GCN-NEXT:    v_mov_b32_e32 v25, v0
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
-; GCN-NEXT:    v_lshlrev_b32_e32 v34, 16, v1
-; GCN-NEXT:    v_lshlrev_b32_e32 v35, 16, v3
-; GCN-NEXT:    v_lshlrev_b32_e32 v20, 16, v5
-; GCN-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; GCN-NEXT:    v_lshlrev_b32_e32 v30, 16, v9
-; GCN-NEXT:    v_lshlrev_b32_e32 v31, 16, v11
-; GCN-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
-; GCN-NEXT:    v_lshlrev_b32_e32 v33, 16, v15
-; GCN-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GCN-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB28_3
-; GCN-NEXT:  ; %bb.1: ; %Flow
-; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB28_4
-; GCN-NEXT:  .LBB28_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB28_3: ; %cmp.false
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v25
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v24
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v34
-; GCN-NEXT:    v_or_b32_e32 v1, v1, v35
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v23
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v22
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v21
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v26
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v27
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v28
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v16
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v18
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v20
-; GCN-NEXT:    v_or_b32_e32 v3, v3, v29
-; GCN-NEXT:    v_or_b32_e32 v4, v4, v30
-; GCN-NEXT:    v_or_b32_e32 v5, v5, v31
-; GCN-NEXT:    v_or_b32_e32 v6, v6, v32
-; GCN-NEXT:    v_or_b32_e32 v7, v7, v33
-; GCN-NEXT:    v_or_b32_e32 v8, v8, v17
-; GCN-NEXT:    v_or_b32_e32 v9, v9, v19
-; GCN-NEXT:    ; implicit-def: $vgpr25
-; GCN-NEXT:    ; implicit-def: $vgpr24
-; GCN-NEXT:    ; implicit-def: $vgpr23
-; GCN-NEXT:    ; implicit-def: $vgpr22
-; GCN-NEXT:    ; implicit-def: $vgpr21
-; GCN-NEXT:    ; implicit-def: $vgpr26
-; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:    ; implicit-def: $vgpr28
-; GCN-NEXT:    ; implicit-def: $vgpr16
-; GCN-NEXT:    ; implicit-def: $vgpr18
-; GCN-NEXT:    ; implicit-def: $vgpr34
-; GCN-NEXT:    ; implicit-def: $vgpr35
-; GCN-NEXT:    ; implicit-def: $vgpr20
-; GCN-NEXT:    ; implicit-def: $vgpr29
-; GCN-NEXT:    ; implicit-def: $vgpr30
-; GCN-NEXT:    ; implicit-def: $vgpr31
-; GCN-NEXT:    ; implicit-def: $vgpr32
-; GCN-NEXT:    ; implicit-def: $vgpr33
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB28_2
-; GCN-NEXT:  .LBB28_4: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v25
-; GCN-NEXT:    s_mov_b32 s6, 0x30000
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v24
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v23
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, 3, v22
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v21
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 3, v26
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v27
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 3, v28
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v16
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 3, v18
-; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GCN-NEXT:    v_or_b32_e32 v0, v34, v0
-; GCN-NEXT:    v_or_b32_e32 v1, v35, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v20, v2
-; GCN-NEXT:    v_or_b32_e32 v3, v29, v3
-; GCN-NEXT:    v_or_b32_e32 v4, v30, v4
-; GCN-NEXT:    v_or_b32_e32 v5, v31, v5
-; GCN-NEXT:    v_or_b32_e32 v6, v32, v6
-; GCN-NEXT:    v_or_b32_e32 v7, v33, v7
-; GCN-NEXT:    v_or_b32_e32 v8, v17, v8
-; GCN-NEXT:    v_or_b32_e32 v9, v19, v9
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 0x30000, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s6, v1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s6, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, s6, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, s6, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, s6, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, s6, v7
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, s6, v8
-; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v9
+; GCN-NEXT:    s_cbranch_execz .LBB16_2
+; GCN-NEXT:  ; %bb.1: ; %cmp.true
+; GCN-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; GCN-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; GCN-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; GCN-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; GCN-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; GCN-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; GCN-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:  .LBB16_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v20i16_to_v5i64:
+; VI-LABEL: bitcast_v10f32_to_v5i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB28_2
+; VI-NEXT:    s_cbranch_execz .LBB16_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
-; VI-NEXT:    v_mov_b32_e32 v11, 3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v9
-; VI-NEXT:    v_add_u16_sdwa v9, v9, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:    v_add_u16_e32 v10, 3, v8
-; VI-NEXT:    v_add_u16_sdwa v8, v8, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v8, v10, v8
-; VI-NEXT:    v_add_u16_e32 v10, 3, v7
-; VI-NEXT:    v_add_u16_sdwa v7, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v7, v10, v7
-; VI-NEXT:    v_add_u16_e32 v10, 3, v6
-; VI-NEXT:    v_add_u16_sdwa v6, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v6, v10, v6
-; VI-NEXT:    v_add_u16_e32 v10, 3, v5
-; VI-NEXT:    v_add_u16_sdwa v5, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v5, v10, v5
-; VI-NEXT:    v_add_u16_e32 v10, 3, v4
-; VI-NEXT:    v_add_u16_sdwa v4, v4, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_add_u16_e32 v10, 3, v3
-; VI-NEXT:    v_add_u16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v3, v10, v3
-; VI-NEXT:    v_add_u16_e32 v10, 3, v2
-; VI-NEXT:    v_add_u16_sdwa v2, v2, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v2, v10, v2
-; VI-NEXT:    v_add_u16_e32 v10, 3, v1
-; VI-NEXT:    v_add_u16_sdwa v1, v1, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v1, v10, v1
-; VI-NEXT:    v_add_u16_e32 v10, 3, v0
-; VI-NEXT:    v_add_u16_sdwa v0, v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v10, v0
-; VI-NEXT:  .LBB28_2: ; %end
+; VI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; VI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; VI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT:  .LBB16_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v20i16_to_v5i64:
+; GFX9-LABEL: bitcast_v10f32_to_v5i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB28_2
+; GFX9-NEXT:    s_cbranch_execz .LBB16_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
-; GFX9-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB28_2: ; %end
+; GFX9-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT:  .LBB16_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v20i16_to_v5i64:
+; GFX11-LABEL: bitcast_v10f32_to_v5i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
@@ -9525,31 +6483,25 @@ define <5 x i64> @bitcast_v20i16_to_v5i64(<20 x i16> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB28_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:  .LBB28_2: ; %end
+; GFX11-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-NEXT:    v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-NEXT:    v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-NEXT:    v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-NEXT:    v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT:  ; %bb.2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
 cmp.true:
-  %a1 = add <20 x i16> %a, splat (i16 3)
-  %a2 = bitcast <20 x i16> %a1 to <5 x i64>
+  %a1 = fadd <10 x float> %a, splat (float 1.000000e+00)
+  %a2 = bitcast <10 x float> %a1 to <5 x i64>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <20 x i16> %a to <5 x i64>
+  %a3 = bitcast <10 x float> %a to <5 x i64>
   br label %end
 
 end:
@@ -9557,81 +6509,38 @@ end:
   ret <5 x i64> %phi
 }
 
-define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
-; GCN-LABEL: bitcast_v5i64_to_v20i16:
+define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) {
+; GCN-LABEL: bitcast_v5i64_to_v10f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v18, v9
-; GCN-NEXT:    v_mov_b32_e32 v16, v8
-; GCN-NEXT:    v_mov_b32_e32 v14, v7
-; GCN-NEXT:    v_mov_b32_e32 v12, v6
-; GCN-NEXT:    v_mov_b32_e32 v20, v5
-; GCN-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-NEXT:    v_mov_b32_e32 v6, v3
-; GCN-NEXT:    v_mov_b32_e32 v4, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GCN-NEXT:    ; implicit-def: $vgpr1
-; GCN-NEXT:    ; implicit-def: $vgpr3
-; GCN-NEXT:    ; implicit-def: $vgpr5
-; GCN-NEXT:    ; implicit-def: $vgpr7
-; GCN-NEXT:    ; implicit-def: $vgpr9
-; GCN-NEXT:    ; implicit-def: $vgpr11
-; GCN-NEXT:    ; implicit-def: $vgpr13
-; GCN-NEXT:    ; implicit-def: $vgpr15
-; GCN-NEXT:    ; implicit-def: $vgpr17
-; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB29_2
-; GCN-NEXT:  ; %bb.1: ; %cmp.false
-; GCN-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v20, v8, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT:  .LBB29_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB29_4
-; GCN-NEXT:  ; %bb.3: ; %cmp.true
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GCN-NEXT:    s_cbranch_execz .LBB17_2
+; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
-; GCN-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, 3, v12
-; GCN-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
-; GCN-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
-; GCN-NEXT:    v_addc_u32_e32 v18, vcc, 0, v18, vcc
-; GCN-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; GCN-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT:    v_alignbit_b32 v9, v20, v8, 16
-; GCN-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
-; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT:  .LBB29_4: ; %end
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:  .LBB17_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v10, v20
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bitcast_v5i64_to_v20i16:
+; VI-LABEL: bitcast_v5i64_to_v10f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB29_2
+; VI-NEXT:    s_cbranch_execz .LBB17_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
@@ -9643,18 +6552,18 @@ define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:  .LBB29_2: ; %end
+; VI-NEXT:  .LBB17_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: bitcast_v5i64_to_v20i16:
+; GFX9-LABEL: bitcast_v5i64_to_v10f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB29_2
+; GFX9-NEXT:    s_cbranch_execz .LBB17_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 3, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
@@ -9666,11 +6575,11 @@ define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:  .LBB29_2: ; %end
+; GFX9-NEXT:  .LBB17_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v5i64_to_v20i16:
+; GFX11-LABEL: bitcast_v5i64_to_v10f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, exec_lo
@@ -9678,7 +6587,7 @@ define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB29_2
+; GFX11-NEXT:    s_cbranch_execz .LBB17_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -9693,7 +6602,7 @@ define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT:  .LBB29_2: ; %end
+; GFX11-NEXT:  .LBB17_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -9701,16 +6610,16 @@ define <20 x i16> @bitcast_v5i64_to_v20i16(<5 x i64> %a, i32 %b) {
 
 cmp.true:
   %a1 = add <5 x i64> %a, splat (i64 3)
-  %a2 = bitcast <5 x i64> %a1 to <20 x i16>
+  %a2 = bitcast <5 x i64> %a1 to <10 x float>
   br label %end
 
 cmp.false:
-  %a3 = bitcast <5 x i64> %a to <20 x i16>
+  %a3 = bitcast <5 x i64> %a to <10 x float>
   br label %end
 
 end:
-  %phi = phi <20 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
-  ret <20 x i16> %phi
+  %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <10 x float> %phi
 }
 
 define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
@@ -9787,7 +6696,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr22
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB30_2
+; GCN-NEXT:    s_cbranch_execz .LBB18_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v55
 ; GCN-NEXT:    v_lshlrev_b32_e32 v7, 16, v21
@@ -9849,9 +6758,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr47
 ; GCN-NEXT:    ; implicit-def: $vgpr45
 ; GCN-NEXT:    ; implicit-def: $vgpr44
-; GCN-NEXT:  .LBB30_2: ; %Flow
+; GCN-NEXT:  .LBB18_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB30_4
+; GCN-NEXT:    s_cbranch_execz .LBB18_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v5, v47
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v6, v45
@@ -9958,7 +6867,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v30, 8, v9
 ; GCN-NEXT:    v_lshrrev_b32_e32 v23, 8, v6
 ; GCN-NEXT:    v_bfe_u32 v22, v1, 8, 8
-; GCN-NEXT:  .LBB30_4: ; %end
+; GCN-NEXT:  .LBB18_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xff, v29
 ; GCN-NEXT:    v_lshlrev_b32_e32 v38, 8, v38
@@ -10116,7 +7025,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB30_2
+; VI-NEXT:    s_cbranch_execz .LBB18_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -10138,9 +7047,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 24, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB30_2: ; %Flow
+; VI-NEXT:  .LBB18_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB30_4
+; VI-NEXT:    s_cbranch_execz .LBB18_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_mov_b32_e32 v11, 0x200
 ; VI-NEXT:    v_add_f16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -10203,7 +7112,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    v_bfe_u32 v35, v19, 8, 8
 ; VI-NEXT:    v_bfe_u32 v38, v21, 8, 8
 ; VI-NEXT:    v_bfe_u32 v48, v23, 8, 8
-; VI-NEXT:  .LBB30_4: ; %end
+; VI-NEXT:  .LBB18_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; VI-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -10313,7 +7222,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr11
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB30_2
+; GFX9-NEXT:    s_cbranch_execz .LBB18_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -10345,9 +7254,9 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB30_2: ; %Flow
+; GFX9-NEXT:  .LBB18_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB30_4
+; GFX9-NEXT:    s_cbranch_execz .LBB18_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    s_movk_i32 s6, 0x200
 ; GFX9-NEXT:    v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
@@ -10390,7 +7299,7 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB30_4: ; %end
+; GFX9-NEXT:  .LBB18_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -10455,217 +7364,401 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v20f16_to_v40i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB30_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB30_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB30_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB30_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v16
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xff, v48
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v31, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v39
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v16
-; GFX11-NEXT:    v_or_b32_e32 v15, v48, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v14, v35, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v31
-; GFX11-NEXT:    v_or_b32_e32 v13, v30, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_or_b32_e32 v16, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v29
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v27
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v26
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v32
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v34
-; GFX11-NEXT:    v_or_b32_e32 v32, v33, v32
-; GFX11-NEXT:    v_or_b32_e32 v12, v25, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v16
-; GFX11-NEXT:    v_or_b32_e32 v11, v20, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v30
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v15
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v20f16_to_v40i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB18_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB18_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB18_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB18_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v1.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v2.h, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v14, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v6.h, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v30, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v26, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v13, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v17, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB18_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB18_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB18_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB18_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v31, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v48, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v35, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v38, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v33, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v25, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v20, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v15
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -10754,7 +7847,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB31_2
+; GCN-NEXT:    s_cbranch_execz .LBB19_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v2
@@ -10856,9 +7949,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr45
 ; GCN-NEXT:    ; implicit-def: $vgpr46
 ; GCN-NEXT:    ; implicit-def: $vgpr47
-; GCN-NEXT:  .LBB31_2: ; %Flow
+; GCN-NEXT:  .LBB19_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB31_4
+; GCN-NEXT:    s_cbranch_execz .LBB19_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v59
 ; GCN-NEXT:    s_movk_i32 s6, 0x300
@@ -10961,7 +8054,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v17, v24
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v34, v22
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v19, v20
-; GCN-NEXT:  .LBB31_4: ; %end
+; GCN-NEXT:  .LBB19_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v31
 ; GCN-NEXT:    v_mov_b32_e32 v2, v23
@@ -11047,7 +8140,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB31_2
+; VI-NEXT:    s_cbranch_execz .LBB19_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -11120,9 +8213,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr45
 ; VI-NEXT:    ; implicit-def: $vgpr46
 ; VI-NEXT:    ; implicit-def: $vgpr47
-; VI-NEXT:  .LBB31_2: ; %Flow
+; VI-NEXT:  .LBB19_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB31_4
+; VI-NEXT:    s_cbranch_execz .LBB19_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u16_e32 v0, 3, v55
 ; VI-NEXT:    v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -11198,7 +8291,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_or_b32_e32 v7, v12, v7
 ; VI-NEXT:    v_or_b32_e32 v8, v11, v8
 ; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:  .LBB31_4: ; %end
+; VI-NEXT:  .LBB19_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -11270,7 +8363,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB31_2
+; GFX9-NEXT:    s_cbranch_execz .LBB19_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v36, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -11344,9 +8437,9 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr44
 ; GFX9-NEXT:    ; implicit-def: $vgpr47
 ; GFX9-NEXT:    ; implicit-def: $vgpr46
-; GFX9-NEXT:  .LBB31_2: ; %Flow
+; GFX9-NEXT:  .LBB19_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB31_4
+; GFX9-NEXT:    s_cbranch_execz .LBB19_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    v_add_u16_e32 v0, 3, v42
@@ -11421,7 +8514,7 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_perm_b32 v7, v12, v7, s6
 ; GFX9-NEXT:    v_perm_b32 v8, v11, v8, s6
 ; GFX9-NEXT:    v_perm_b32 v9, v10, v9, s6
-; GFX9-NEXT:  .LBB31_4: ; %end
+; GFX9-NEXT:  .LBB19_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -11434,255 +8527,496 @@ define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v40i8_to_v20f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v33, v10 :: v_dual_mov_b32 v34, v8
-; GFX11-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
-; GFX11-NEXT:    s_clause 0x9
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:4
-; GFX11-NEXT:    v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b16 v70, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB31_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB31_4
-; GFX11-NEXT:  .LBB31_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB31_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v48
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v49
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v51
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v50
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v55
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v39
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v52
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v54
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v17
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v66
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v25
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v69
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v29
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v70
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v71
-; GFX11-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v10, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v14, v13, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB31_2
-; GFX11-NEXT:  .LBB31_4: ; %cmp.true
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v0, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v66, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_add_nc_u16 v3, v65, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v28, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v2, v67, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v70, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v71, v1
-; GFX11-NEXT:    v_or_b32_e32 v3, v29, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v27, v2
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v69, v4
-; GFX11-NEXT:    v_add_nc_u16 v1, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v26, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, v24, 3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v23, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_perm_b32 v8, v11, v8, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v25, v0
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v21, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v64, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v19, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v31, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v54, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v53, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v17, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v52, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v19, v33, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; GFX11-NEXT:    v_or_b32_e32 v2, v55, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v51, v4
-; GFX11-NEXT:    v_or_b32_e32 v16, v48, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v49, v17
-; GFX11-NEXT:    v_or_b32_e32 v18, v50, v18
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v0
-; GFX11-NEXT:    v_perm_b32 v0, v17, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v18, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v19, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v21, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v14, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v13, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v12, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v10, v9, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v20f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x9
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v24.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.h, 8, v35.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v33.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v34.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v35.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v37
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX11-TRUE16-NEXT:  .LBB19_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB19_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v29.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v35.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v34.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v35.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB19_2
+; GFX11-TRUE16-NEXT:  .LBB19_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v34.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v35.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v34.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v35.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v24.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v23.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v28.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v30.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v29.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v22.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v20.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v29.h, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v26.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v27.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v25.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v25.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v23.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v21.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v17.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v17.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v27.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v16.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v19.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v20.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v18.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v18.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v10.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v10.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v20f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v33, v10 :: v_dual_mov_b32 v34, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x9
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v70, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB19_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB19_4
+; GFX11-FAKE16-NEXT:  .LBB19_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB19_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v70
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v71
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v14, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB19_2
+; GFX11-FAKE16-NEXT:  .LBB19_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v66, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v28, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v67, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v70, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v71, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v29, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v27, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v69, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v30, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v26, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v24, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v23, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v11, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v25, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v21, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v64, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v19, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v31, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v54, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v53, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v17, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v52, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, v33, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v55, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v51, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v48, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v49, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v50, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v39, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v17, v16, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v18, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v19, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v20, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v21, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v14, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v13, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v12, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -11728,14 +9062,14 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB32_3
+; GCN-NEXT:    s_cbranch_execnz .LBB20_3
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB32_4
-; GCN-NEXT:  .LBB32_2: ; %end
+; GCN-NEXT:    s_cbranch_execnz .LBB20_4
+; GCN-NEXT:  .LBB20_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB32_3: ; %cmp.false
+; GCN-NEXT:  .LBB20_3: ; %cmp.false
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v36
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
 ; GCN-NEXT:    v_or_b32_e32 v0, v35, v0
@@ -11777,8 +9111,8 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    ; implicit-def: $vgpr16
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB32_2
-; GCN-NEXT:  .LBB32_4: ; %cmp.true
+; GCN-NEXT:    s_cbranch_execz .LBB20_2
+; GCN-NEXT:  .LBB20_4: ; %cmp.true
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v36
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v35
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v34
@@ -11869,7 +9203,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB32_2
+; VI-NEXT:    s_cbranch_execz .LBB20_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_mov_b32_e32 v10, 0x200
 ; VI-NEXT:    v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -11902,7 +9236,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    v_add_f16_e32 v0, 0x200, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v11
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v10
-; VI-NEXT:  .LBB32_2: ; %end
+; VI-NEXT:  .LBB20_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -11913,7 +9247,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB32_2
+; GFX9-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    s_movk_i32 s6, 0x200
 ; GFX9-NEXT:    v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
@@ -11926,7 +9260,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB32_2: ; %end
+; GFX9-NEXT:  .LBB20_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -11938,7 +9272,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB32_2
+; GFX11-NEXT:    s_cbranch_execz .LBB20_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
@@ -11950,7 +9284,7 @@ define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) {
 ; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:  .LBB32_2: ; %end
+; GFX11-NEXT:  .LBB20_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -11997,7 +9331,7 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB33_2
+; GCN-NEXT:    s_cbranch_execz .LBB21_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
 ; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v8
@@ -12034,9 +9368,9 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr4
 ; GCN-NEXT:    ; implicit-def: $vgpr6
 ; GCN-NEXT:    ; implicit-def: $vgpr8
-; GCN-NEXT:  .LBB33_2: ; %Flow
+; GCN-NEXT:  .LBB21_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB33_4
+; GCN-NEXT:    s_cbranch_execz .LBB21_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
@@ -12073,7 +9407,7 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v28, v28
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT:  .LBB33_4: ; %end
+; GCN-NEXT:  .LBB21_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v0, v24
 ; GCN-NEXT:    v_mov_b32_e32 v1, v29
@@ -12094,14 +9428,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB33_2
+; VI-NEXT:    s_cbranch_execz .LBB21_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; VI-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; VI-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; VI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT:  .LBB33_2: ; %end
+; VI-NEXT:  .LBB21_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12112,14 +9446,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB33_2
+; GFX9-NEXT:    s_cbranch_execz .LBB21_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; GFX9-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT:  .LBB33_2: ; %end
+; GFX9-NEXT:  .LBB21_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12131,14 +9465,14 @@ define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-NEXT:    s_cbranch_execz .LBB21_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
 ; GFX11-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX11-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT:  .LBB33_2: ; %end
+; GFX11-NEXT:  .LBB21_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -12186,14 +9520,14 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB34_3
+; GCN-NEXT:    s_cbranch_execnz .LBB22_3
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB34_4
-; GCN-NEXT:  .LBB34_2: ; %end
+; GCN-NEXT:    s_cbranch_execnz .LBB22_4
+; GCN-NEXT:  .LBB22_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB34_3: ; %cmp.false
+; GCN-NEXT:  .LBB22_3: ; %cmp.false
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v36
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
 ; GCN-NEXT:    v_or_b32_e32 v0, v35, v0
@@ -12235,8 +9569,8 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    ; implicit-def: $vgpr16
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB34_2
-; GCN-NEXT:  .LBB34_4: ; %cmp.true
+; GCN-NEXT:    s_cbranch_execz .LBB22_2
+; GCN-NEXT:  .LBB22_4: ; %cmp.true
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v0, v36
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v1, v35
 ; GCN-NEXT:    v_cvt_f32_f16_e32 v2, v34
@@ -12327,7 +9661,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB34_2
+; VI-NEXT:    s_cbranch_execz .LBB22_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_mov_b32_e32 v10, 0x200
 ; VI-NEXT:    v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -12360,7 +9694,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; VI-NEXT:    v_add_f16_e32 v0, 0x200, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v11
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v10
-; VI-NEXT:  .LBB34_2: ; %end
+; VI-NEXT:  .LBB22_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12371,7 +9705,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB34_2
+; GFX9-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    s_movk_i32 s6, 0x200
 ; GFX9-NEXT:    v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
@@ -12384,7 +9718,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
-; GFX9-NEXT:  .LBB34_2: ; %end
+; GFX9-NEXT:  .LBB22_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12396,7 +9730,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-NEXT:    s_cbranch_execz .LBB22_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
@@ -12408,7 +9742,7 @@ define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) {
 ; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:  .LBB34_2: ; %end
+; GFX11-NEXT:  .LBB22_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -12465,14 +9799,14 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr19
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB35_3
+; GCN-NEXT:    s_cbranch_execnz .LBB23_3
 ; GCN-NEXT:  ; %bb.1: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execnz .LBB35_4
-; GCN-NEXT:  .LBB35_2: ; %end
+; GCN-NEXT:    s_cbranch_execnz .LBB23_4
+; GCN-NEXT:  .LBB23_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-; GCN-NEXT:  .LBB35_3: ; %cmp.false
+; GCN-NEXT:  .LBB23_3: ; %cmp.false
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v22
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v21
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v24
@@ -12514,8 +9848,8 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr21
 ; GCN-NEXT:    ; implicit-def: $vgpr22
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB35_2
-; GCN-NEXT:  .LBB35_4: ; %cmp.true
+; GCN-NEXT:    s_cbranch_execz .LBB23_2
+; GCN-NEXT:  .LBB23_4: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v20
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v29, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 3, v27
@@ -12566,7 +9900,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB35_2
+; VI-NEXT:    s_cbranch_execz .LBB23_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
@@ -12578,7 +9912,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:  .LBB35_2: ; %end
+; VI-NEXT:  .LBB23_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12589,7 +9923,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB35_2
+; GFX9-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 3, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
@@ -12601,7 +9935,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:  .LBB35_2: ; %end
+; GFX9-NEXT:  .LBB23_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12613,7 +9947,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-NEXT:    s_cbranch_execz .LBB23_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -12628,7 +9962,7 @@ define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT:  .LBB35_2: ; %end
+; GFX11-NEXT:  .LBB23_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -12707,7 +10041,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB36_2
+; GCN-NEXT:    s_cbranch_execz .LBB24_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v31
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v32
@@ -12819,9 +10153,9 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr25
 ; GCN-NEXT:    ; implicit-def: $vgpr43
 ; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:  .LBB36_2: ; %Flow
+; GCN-NEXT:  .LBB24_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB36_4
+; GCN-NEXT:    s_cbranch_execz .LBB24_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v31
 ; GCN-NEXT:    s_movk_i32 s6, 0x300
@@ -12935,7 +10269,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, s7, v8
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v9
-; GCN-NEXT:  .LBB36_4: ; %end
+; GCN-NEXT:  .LBB24_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -13008,7 +10342,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB36_2
+; VI-NEXT:    s_cbranch_execz .LBB24_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -13081,9 +10415,9 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr21
 ; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr17
-; VI-NEXT:  .LBB36_2: ; %Flow
+; VI-NEXT:  .LBB24_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB36_4
+; VI-NEXT:    s_cbranch_execz .LBB24_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u16_e32 v0, 3, v31
 ; VI-NEXT:    v_add_u16_e32 v1, 3, v32
@@ -13157,7 +10491,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_add_u16_e32 v10, 0x300, v10
 ; VI-NEXT:    v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:  .LBB36_4: ; %end
+; VI-NEXT:  .LBB24_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -13231,7 +10565,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB36_2
+; GFX9-NEXT:    s_cbranch_execz .LBB24_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -13304,9 +10638,9 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr21
 ; GFX9-NEXT:    ; implicit-def: $vgpr19
 ; GFX9-NEXT:    ; implicit-def: $vgpr17
-; GFX9-NEXT:  .LBB36_2: ; %Flow
+; GFX9-NEXT:  .LBB24_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB36_4
+; GFX9-NEXT:    s_cbranch_execz .LBB24_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    v_add_u16_e32 v0, 3, v31
 ; GFX9-NEXT:    v_add_u16_e32 v1, 3, v32
@@ -13380,7 +10714,7 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u16_e32 v9, 0x300, v9
 ; GFX9-NEXT:    v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX9-NEXT:  .LBB36_4: ; %end
+; GFX9-NEXT:  .LBB24_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -13394,291 +10728,585 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v40i8_to_v5f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x9
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:4
-; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v68, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v70, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB36_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB36_4
-; GFX11-NEXT:  .LBB36_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB36_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v67
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v68
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v69
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v70
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v71
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v54
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v65
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v66
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v39
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v51
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v52
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v53
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v27
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v29
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v17
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v23
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB36_2
-; GFX11-NEXT:  .LBB36_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v18, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v67, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v68, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v69, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v70, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v71, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v54, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v55, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v64, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v65, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v66, v9
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_add_nc_u16 v5, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v11, v50, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v48, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v51, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v52, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v53, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v27, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v29, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v17, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v19, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v21, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v23, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v25, v14
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v5f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x9
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v38.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v49
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB24_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB24_4
+; GFX11-TRUE16-NEXT:  .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB24_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v24.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT:  .LBB24_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v30.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v33.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v27.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v23.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v27.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v22.h, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v23.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v25.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v21.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v24.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v25.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v26.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v18.h, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v19.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v21.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v19.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v20.h, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v16.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v16.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v17.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v17.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v18.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v5f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x9
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v68, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v70, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB24_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB24_4
+; GFX11-FAKE16-NEXT:  .LBB24_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB24_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v70
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT:  .LBB24_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v18, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v67, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v68, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v69, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v70, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v71, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v54, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v55, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v64, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v65, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v66, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v50, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v48, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v51, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v52, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v53, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v27, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v29, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v17, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v19, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v21, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v23, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v25, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -13733,7 +11361,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr20
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB37_2
+; GCN-NEXT:    s_cbranch_execz .LBB25_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_alignbit_b32 v11, v10, v9, 24
 ; GCN-NEXT:    v_alignbit_b32 v12, v10, v9, 16
@@ -13765,9 +11393,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB37_2: ; %Flow
+; GCN-NEXT:  .LBB25_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB37_4
+; GCN-NEXT:    s_cbranch_execz .LBB25_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
 ; GCN-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
@@ -13804,7 +11432,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB37_4: ; %end
+; GCN-NEXT:  .LBB25_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v49, 0xff, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v50, 8, v35
@@ -13954,7 +11582,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB37_2
+; VI-NEXT:    s_cbranch_execz .LBB25_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -13986,9 +11614,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB37_2: ; %Flow
+; VI-NEXT:  .LBB25_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB37_4
+; VI-NEXT:    s_cbranch_execz .LBB25_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
 ; VI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
@@ -14025,7 +11653,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB37_4: ; %end
+; VI-NEXT:  .LBB25_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; VI-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -14135,7 +11763,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr11
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB37_2
+; GFX9-NEXT:    s_cbranch_execz .LBB25_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -14167,9 +11795,9 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB37_2: ; %Flow
+; GFX9-NEXT:  .LBB25_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB37_4
+; GFX9-NEXT:    s_cbranch_execz .LBB25_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
 ; GFX9-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
@@ -14206,7 +11834,7 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB37_4: ; %end
+; GFX9-NEXT:  .LBB25_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -14271,212 +11899,391 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v5f64_to_v40i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB37_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB37_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB37_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; GFX11-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB37_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v16
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xff, v48
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v31, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v39
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v16
-; GFX11-NEXT:    v_or_b32_e32 v15, v48, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v14, v35, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v31
-; GFX11-NEXT:    v_or_b32_e32 v13, v30, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_or_b32_e32 v16, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v29
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v27
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v26
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v32
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v34
-; GFX11-NEXT:    v_or_b32_e32 v32, v33, v32
-; GFX11-NEXT:    v_or_b32_e32 v12, v25, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v16
-; GFX11-NEXT:    v_or_b32_e32 v11, v20, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v30
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v15
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v5f64_to_v40i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB25_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB25_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB25_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v1.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v2.h, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v14, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v6.h, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v30, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v26, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v13, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v17, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB25_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB25_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB25_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v31, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v48, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v35, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v38, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v33, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v25, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v20, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v15
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -14553,7 +12360,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB38_2
+; GCN-NEXT:    s_cbranch_execz .LBB26_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v31
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xff, v32
@@ -14665,9 +12472,9 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr25
 ; GCN-NEXT:    ; implicit-def: $vgpr43
 ; GCN-NEXT:    ; implicit-def: $vgpr27
-; GCN-NEXT:  .LBB38_2: ; %Flow
+; GCN-NEXT:  .LBB26_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB38_4
+; GCN-NEXT:    s_cbranch_execz .LBB26_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v31
 ; GCN-NEXT:    s_movk_i32 s6, 0x300
@@ -14781,7 +12588,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, s7, v8
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v9
-; GCN-NEXT:  .LBB38_4: ; %end
+; GCN-NEXT:  .LBB26_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -14854,7 +12661,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB38_2
+; VI-NEXT:    s_cbranch_execz .LBB26_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -14927,9 +12734,9 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr21
 ; VI-NEXT:    ; implicit-def: $vgpr19
 ; VI-NEXT:    ; implicit-def: $vgpr17
-; VI-NEXT:  .LBB38_2: ; %Flow
+; VI-NEXT:  .LBB26_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB38_4
+; VI-NEXT:    s_cbranch_execz .LBB26_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u16_e32 v0, 3, v31
 ; VI-NEXT:    v_add_u16_e32 v1, 3, v32
@@ -15003,7 +12810,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; VI-NEXT:    v_add_u16_e32 v10, 0x300, v10
 ; VI-NEXT:    v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v9, v10, v9
-; VI-NEXT:  .LBB38_4: ; %end
+; VI-NEXT:  .LBB26_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; VI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -15077,7 +12884,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB38_2
+; GFX9-NEXT:    s_cbranch_execz .LBB26_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -15150,9 +12957,9 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr21
 ; GFX9-NEXT:    ; implicit-def: $vgpr19
 ; GFX9-NEXT:    ; implicit-def: $vgpr17
-; GFX9-NEXT:  .LBB38_2: ; %Flow
+; GFX9-NEXT:  .LBB26_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB38_4
+; GFX9-NEXT:    s_cbranch_execz .LBB26_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    v_add_u16_e32 v0, 3, v31
 ; GFX9-NEXT:    v_add_u16_e32 v1, 3, v32
@@ -15226,7 +13033,7 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    v_add_u16_e32 v9, 0x300, v9
 ; GFX9-NEXT:    v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v9, v9, v10
-; GFX9-NEXT:  .LBB38_4: ; %end
+; GFX9-NEXT:  .LBB26_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
@@ -15240,291 +13047,585 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v40i8_to_v5i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x9
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:4
-; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v68, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v70, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v64, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_4
-; GFX11-NEXT:  .LBB38_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB38_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v67
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v68
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v69
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v70
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v71
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v54
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v55
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v64
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v65
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v66
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v39
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v51
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v52
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v53
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v27
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v29
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v17
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v23
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB38_2
-; GFX11-NEXT:  .LBB38_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v18, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v67, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v68, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v69, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v70, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v71, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v54, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v55, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v64, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v65, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v66, v9
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v4, v8, v9
-; GFX11-NEXT:    v_add_nc_u16 v5, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v22, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v11, v50, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v48, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v51, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v52, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v53, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v27, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v29, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v17, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v19, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v21, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v23, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v25, v14
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v40i8_to_v5i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x9
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v33.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v34.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v38.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v49
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB26_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB26_4
+; GFX11-TRUE16-NEXT:  .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB26_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v33.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v33.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v24.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v5.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT:  .LBB26_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v30.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v33.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v27.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v23.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v27.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v22.h, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v23.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v25.l, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v21.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v24.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v25.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v26.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v18.h, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v19.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v21.l, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v19.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v20.h, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v16.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v16.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v17.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v17.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v18.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v5.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v14, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v8, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v17
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v40i8_to_v5i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x9
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v68, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v70, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v64, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB26_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB26_4
+; GFX11-FAKE16-NEXT:  .LBB26_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB26_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v70
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v55
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v64
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v51
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT:  .LBB26_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v18, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v67, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v68, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v69, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v70, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v71, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v54, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v55, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v64, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v65, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v66, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v8, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v22, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v50, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v48, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v51, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v52, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v53, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v27, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v29, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v17, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v19, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v21, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v23, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v25, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v14
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -15579,7 +13680,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    ; implicit-def: $vgpr17
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB39_2
+; GCN-NEXT:    s_cbranch_execz .LBB27_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.false
 ; GCN-NEXT:    v_alignbit_b32 v11, v10, v9, 24
 ; GCN-NEXT:    v_alignbit_b32 v12, v10, v9, 16
@@ -15611,9 +13712,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB39_2: ; %Flow
+; GCN-NEXT:  .LBB27_2: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB39_4
+; GCN-NEXT:    s_cbranch_execz .LBB27_4
 ; GCN-NEXT:  ; %bb.3: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
@@ -15655,7 +13756,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    v_lshrrev_b32_e32 v38, 24, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
-; GCN-NEXT:  .LBB39_4: ; %end
+; GCN-NEXT:  .LBB27_4: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_and_b32_e32 v49, 0xff, v1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v35, 8, v35
@@ -15805,7 +13906,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    ; implicit-def: $vgpr11
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB39_2
+; VI-NEXT:    s_cbranch_execz .LBB27_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.false
 ; VI-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; VI-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -15837,9 +13938,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB39_2: ; %Flow
+; VI-NEXT:  .LBB27_2: ; %Flow
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB39_4
+; VI-NEXT:    s_cbranch_execz .LBB27_4
 ; VI-NEXT:  ; %bb.3: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, 3, v1
 ; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
@@ -15881,7 +13982,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; VI-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; VI-NEXT:  .LBB39_4: ; %end
+; VI-NEXT:  .LBB27_4: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; VI-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -15991,7 +14092,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    ; implicit-def: $vgpr11
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB39_2
+; GFX9-NEXT:    s_cbranch_execz .LBB27_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.false
 ; GFX9-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
 ; GFX9-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
@@ -16023,9 +14124,9 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB39_2: ; %Flow
+; GFX9-NEXT:  .LBB27_2: ; %Flow
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB39_4
+; GFX9-NEXT:    s_cbranch_execz .LBB27_4
 ; GFX9-NEXT:  ; %bb.3: ; %cmp.true
 ; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, 3, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
@@ -16067,7 +14168,7 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX9-NEXT:  .LBB39_4: ; %end
+; GFX9-NEXT:  .LBB27_4: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v15, 8, v15
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v16, 8, v16
@@ -16132,220 +14233,407 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v5i64_to_v40i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB39_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB39_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB39_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-NEXT:  .LBB39_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v16
-; GFX11-NEXT:    v_and_b32_e32 v48, 0xff, v48
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v36, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v35, 0xff, v35
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v31, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v39
-; GFX11-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; GFX11-NEXT:    v_lshlrev_b16 v37, 8, v37
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v16
-; GFX11-NEXT:    v_or_b32_e32 v15, v48, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v36
-; GFX11-NEXT:    v_or_b32_e32 v14, v35, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v31
-; GFX11-NEXT:    v_or_b32_e32 v13, v30, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v39
-; GFX11-NEXT:    v_or_b32_e32 v16, v38, v37
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v29
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v28
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v27
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v26
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v34, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v33, 0xff, v33
-; GFX11-NEXT:    v_lshlrev_b16 v32, 8, v32
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v23
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v16
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v34
-; GFX11-NEXT:    v_or_b32_e32 v32, v33, v32
-; GFX11-NEXT:    v_or_b32_e32 v12, v25, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v16
-; GFX11-NEXT:    v_or_b32_e32 v11, v20, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v19
-; GFX11-NEXT:    v_or_b32_e32 v15, v18, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v30
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v13
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v14
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v15
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v5i64_to_v40i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr15_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr11_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB27_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB27_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB27_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB27_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v1.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v29, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v28.l, v2.h, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v15, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v14, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v6.h, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v29, v16
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v7.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v30, v28
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v26, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v15, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v16, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v13, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v17, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v19, v10
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB27_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB27_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB27_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[12:13], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[15:16], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB27_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v48, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v36, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v35, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v31, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v39
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v38, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v37, 8, v37
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v48, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v36
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v35, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v30, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v38, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v34, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v33, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v32, 8, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, v33, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v25, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v20, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v18, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v15
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b64 v0, v[9:10], off offset:32
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -16371,14 +14659,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB40_2
+; GCN-NEXT:    s_cbranch_execz .LBB28_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GCN-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GCN-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GCN-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GCN-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; GCN-NEXT:  .LBB40_2: ; %end
+; GCN-NEXT:  .LBB28_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16389,14 +14677,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB40_2
+; VI-NEXT:    s_cbranch_execz .LBB28_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; VI-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; VI-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; VI-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; VI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; VI-NEXT:  .LBB40_2: ; %end
+; VI-NEXT:  .LBB28_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16407,14 +14695,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB40_2
+; GFX9-NEXT:    s_cbranch_execz .LBB28_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GFX9-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX9-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX9-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX9-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; GFX9-NEXT:  .LBB40_2: ; %end
+; GFX9-NEXT:  .LBB28_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16426,14 +14714,14 @@ define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB40_2
+; GFX11-NEXT:    s_cbranch_execz .LBB28_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
 ; GFX11-NEXT:    v_add_f64 v[2:3], v[2:3], 1.0
 ; GFX11-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
 ; GFX11-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; GFX11-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT:  .LBB40_2: ; %end
+; GFX11-NEXT:  .LBB28_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
@@ -16461,7 +14749,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT:    s_cbranch_execz .LBB41_2
+; GCN-NEXT:    s_cbranch_execz .LBB29_2
 ; GCN-NEXT:  ; %bb.1: ; %cmp.true
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -16473,7 +14761,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; GCN-NEXT:  .LBB41_2: ; %end
+; GCN-NEXT:  .LBB29_2: ; %end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16484,7 +14772,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT:    s_cbranch_execz .LBB41_2
+; VI-NEXT:    s_cbranch_execz .LBB29_2
 ; VI-NEXT:  ; %bb.1: ; %cmp.true
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -16496,7 +14784,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; VI-NEXT:  .LBB41_2: ; %end
+; VI-NEXT:  .LBB29_2: ; %end
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16507,7 +14795,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
 ; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB41_2
+; GFX9-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX9-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 3, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -16519,7 +14807,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 3, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX9-NEXT:  .LBB41_2: ; %end
+; GFX9-NEXT:  .LBB29_2: ; %end
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16531,7 +14819,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX11-NEXT:  ; %bb.1: ; %cmp.true
 ; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -16546,7 +14834,7 @@ define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) {
 ; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT:  .LBB41_2: ; %end
+; GFX11-NEXT:  .LBB29_2: ; %end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 332c971e5709f..7f8b733038f1e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define float @bitcast_i32_to_f32(i32 %a, i32 %b) {
 ; GCN-LABEL: bitcast_i32_to_f32:
@@ -684,37 +685,70 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2bf16_to_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB7_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-NEXT:  .LBB7_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB7_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:  .LBB7_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB7_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB7_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -965,37 +999,64 @@ define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_i32_to_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB10_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB10_4
-; GFX11-NEXT:  .LBB10_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB10_3: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB10_2
-; GFX11-NEXT:  .LBB10_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_i32_to_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_i32_to_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB10_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB10_4
+; GFX11-FAKE16-NEXT:  .LBB10_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB10_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB10_2
+; GFX11-FAKE16-NEXT:  .LBB10_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1138,59 +1199,113 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i8_to_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr0
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX11-NEXT:  .LBB11_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB11_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB11_2
-; GFX11-NEXT:  .LBB11_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v5, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i8_to_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB11_4
+; GFX11-TRUE16-NEXT:  .LBB11_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB11_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-TRUE16-NEXT:  .LBB11_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v2.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v2.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i8_to_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB11_4
+; GFX11-FAKE16-NEXT:  .LBB11_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB11_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-FAKE16-NEXT:  .LBB11_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v5, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v2, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1745,37 +1860,70 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2bf16_to_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB17_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-NEXT:  .LBB17_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB17_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:  .LBB17_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB17_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB17_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2026,37 +2174,64 @@ define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_f32_to_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_4
-; GFX11-NEXT:  .LBB20_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB20_3: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB20_2
-; GFX11-NEXT:  .LBB20_4: ; %cmp.true
-; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_f32_to_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_f32_to_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX11-FAKE16-NEXT:  .LBB20_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB20_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT:  .LBB20_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2199,59 +2374,113 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i8_to_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr0
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB21_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB21_4
-; GFX11-NEXT:  .LBB21_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB21_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB21_2
-; GFX11-NEXT:  .LBB21_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v5, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i8_to_f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX11-TRUE16-NEXT:  .LBB21_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB21_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB21_2
+; GFX11-TRUE16-NEXT:  .LBB21_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v2.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v2.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i8_to_f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB21_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB21_4
+; GFX11-FAKE16-NEXT:  .LBB21_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB21_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT:  .LBB21_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v5, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v2, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2618,37 +2847,72 @@ define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2bf16_to_v2i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB25_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-NEXT:  .LBB25_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:  .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2947,37 +3211,64 @@ define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2i16_to_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB28_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB28_4
-; GFX11-NEXT:  .LBB28_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB28_3: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB28_2
-; GFX11-NEXT:  .LBB28_4: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2i16_to_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2i16_to_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB28_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB28_4
+; GFX11-FAKE16-NEXT:  .LBB28_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB28_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB28_2
+; GFX11-FAKE16-NEXT:  .LBB28_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3123,59 +3414,113 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i8_to_v2i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr0
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB29_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB29_4
-; GFX11-NEXT:  .LBB29_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB29_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB29_2
-; GFX11-NEXT:  .LBB29_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v5, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i8_to_v2i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB29_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB29_4
+; GFX11-TRUE16-NEXT:  .LBB29_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB29_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB29_2
+; GFX11-TRUE16-NEXT:  .LBB29_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v2.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v2.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i8_to_v2i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB29_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB29_4
+; GFX11-FAKE16-NEXT:  .LBB29_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB29_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB29_2
+; GFX11-FAKE16-NEXT:  .LBB29_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v5, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v2, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3390,37 +3735,70 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2bf16_to_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB31_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-NEXT:  .LBB31_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB31_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:  .LBB31_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB31_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB31_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3726,37 +4104,64 @@ define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2f16_to_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB34_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB34_4
-; GFX11-NEXT:  .LBB34_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB34_3: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_2
-; GFX11-NEXT:  .LBB34_4: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2f16_to_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2f16_to_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB34_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB34_4
+; GFX11-FAKE16-NEXT:  .LBB34_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB34_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-FAKE16-NEXT:  .LBB34_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3898,59 +4303,113 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i8_to_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr0
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_4
-; GFX11-NEXT:  .LBB35_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB35_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB35_2
-; GFX11-NEXT:  .LBB35_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v5, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i8_to_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-TRUE16-NEXT:  .LBB35_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-TRUE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v2.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v2.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i8_to_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-FAKE16-NEXT:  .LBB35_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-FAKE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v5, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v2, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4063,37 +4522,70 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2bf16_to_v1i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB36_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-NEXT:  .LBB36_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB36_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:  .LBB36_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB36_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB36_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4332,54 +4824,104 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2bf16_to_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB38_4
-; GFX11-NEXT:  .LBB38_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB38_3: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB38_2
-; GFX11-NEXT:  .LBB38_4: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v4, v0, v1, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v4.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB38_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB38_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB38_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB38_4
+; GFX11-FAKE16-NEXT:  .LBB38_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB38_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB38_2
+; GFX11-FAKE16-NEXT:  .LBB38_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v0, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4521,59 +5063,113 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i8_to_v2bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr0
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB39_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB39_4
-; GFX11-NEXT:  .LBB39_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB39_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB39_2
-; GFX11-NEXT:  .LBB39_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v5, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i8_to_v2bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB39_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB39_4
+; GFX11-TRUE16-NEXT:  .LBB39_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB39_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB39_2
+; GFX11-TRUE16-NEXT:  .LBB39_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v2.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v2.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i8_to_v2bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB39_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB39_4
+; GFX11-FAKE16-NEXT:  .LBB39_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB39_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB39_2
+; GFX11-FAKE16-NEXT:  .LBB39_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v5, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v2, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4682,37 +5278,64 @@ define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v1i32_to_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB40_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB40_4
-; GFX11-NEXT:  .LBB40_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB40_3: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB40_2
-; GFX11-NEXT:  .LBB40_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v1i32_to_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v1i32_to_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB40_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB40_4
+; GFX11-FAKE16-NEXT:  .LBB40_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB40_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB40_2
+; GFX11-FAKE16-NEXT:  .LBB40_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4855,59 +5478,113 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i8_to_v1i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, v0
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    ; implicit-def: $vgpr0
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v4
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB41_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB41_4
-; GFX11-NEXT:  .LBB41_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB41_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB41_2
-; GFX11-NEXT:  .LBB41_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v5, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v2, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i8_to_v1i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB41_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB41_4
+; GFX11-TRUE16-NEXT:  .LBB41_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB41_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-TRUE16-NEXT:  .LBB41_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v2.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v2.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i8_to_v1i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v4
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB41_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB41_4
+; GFX11-FAKE16-NEXT:  .LBB41_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB41_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-FAKE16-NEXT:  .LBB41_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v5, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v2, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index 9bb360f2e3b09..b52128024fbc3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v3bf16_to_v3f16:
@@ -134,47 +135,92 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v3bf16_to_v3f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB0_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v1, 0x7fc0, v1, 16
-; GFX11-NEXT:  .LBB0_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB0_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, v3, v7 :: v_dual_mov_b32 v3, 0x7fc0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v4, v5 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT:  .LBB0_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB0_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, 0x7fc0, v1, 16
+; GFX11-FAKE16-NEXT:  .LBB0_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -426,47 +472,95 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v3bf16_to_v3i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v1
-; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v1, 0x7fc0, v1, 16
-; GFX11-NEXT:  .LBB2_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, 0x7fc0, 16, v1
+; GFX11-TRUE16-NEXT:  .LBB2_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, 0x7fc0, v1, 16
+; GFX11-FAKE16-NEXT:  .LBB2_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index 0d1008082f586..c48a8459fdc3c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v16i32_to_v16f32:
@@ -3069,295 +3070,581 @@ define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32bf16_to_v16i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB11_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v23, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v20, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v19, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v6
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
-; GFX11-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_bfe_u32 v22, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v18, v4, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v21, v3, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT:    v_bfe_u32 v24, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
-; GFX11-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v24, v20, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v23, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:  .LBB11_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v16, v19, v21 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v25, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v22, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v20, v24 :: v_dual_and_b32 v18, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v19, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v16, v20 :: v_dual_and_b32 v20, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v23, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v17, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v23, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v22 :: v_dual_cndmask_b32 v18, v18, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v26, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v20, v0
+; GFX11-TRUE16-NEXT:  .LBB11_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v16i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB11_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4430,332 +4717,614 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16i32_to_v64i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB12_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB12_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB12_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v16, 3, v16
-; GFX11-NEXT:    v_add_nc_u32_e32 v15, 3, v15
-; GFX11-NEXT:    v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT:    v_add_nc_u32_e32 v12, 3, v12
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 3, v11
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, 3, v10
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, 3, v7
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 3, v6
-; GFX11-NEXT:    v_add_nc_u32_e32 v5, 3, v5
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 3, v4
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 3, v3
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB12_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v96, 0xff, v96
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v87
-; GFX11-NEXT:    v_or_b32_e32 v24, v96, v24
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v86, v85
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xff, v83
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v24
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v82
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v80, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v71
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xff, v70
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v84
-; GFX11-NEXT:    v_or_b32_e32 v23, v83, v23
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v80
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v71
-; GFX11-NEXT:    v_or_b32_e32 v22, v70, v22
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v24
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v23
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v25
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v65
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v55
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v22
-; GFX11-NEXT:    v_or_b32_e32 v21, v23, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v22, v25, v54
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v52
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v50
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v39
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v24
-; GFX11-NEXT:    v_or_b32_e32 v23, v25, v49
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v48
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v22
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v37
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v35
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v34
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v69
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xff, v68
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v19, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-NEXT:    v_or_b32_e32 v18, v23, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v30
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v28
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v26
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v69
-; GFX11-NEXT:    v_or_b32_e32 v67, v68, v67
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v20
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v21
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16i32_to_v64i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 3, v16
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 3, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 3, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 3, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 3, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 3, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 3, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, 3, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 3, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB12_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v1.h, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v2.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v54, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v25, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v7.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v53, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v8.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v21, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v24, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v20, v34
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v12.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v25, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v49, v50
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v51, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v35, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v22, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v19, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16i32_to_v64i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB12_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, 3, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 3, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 3, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 3, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 3, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 3, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB12_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v96, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v86, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v83, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v83, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v80
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v70, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v39, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v68, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, v68, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v21
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -6282,471 +6851,950 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64i8_to_v16i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v54, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v55, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v112, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v87, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v96, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v97, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v131
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_4
-; GFX11-NEXT:  .LBB13_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB13_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v118
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v117
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v6, v113
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v114
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v115
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v116
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v101
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v102
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v103
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v112
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v100
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v11
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v68
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v87
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v96
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v97
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v98
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v99
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v81
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v82
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v85
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v51
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v39
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v29
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v65
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v66
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v67
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v17, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v18, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v19, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v20, v24, v25
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB13_2
-; GFX11-NEXT:  .LBB13_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v5, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v37, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v118, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v119, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v2, v117, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v114, v3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v4, v115, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v116, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v113, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, v16, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v6, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v101, v7
-; GFX11-NEXT:    v_or_b32_e32 v6, v100, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v102, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v103, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v112, v10
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_add_nc_u16 v6, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v10, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v68, 3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v87, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v96, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v97, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v98, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v99, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v81, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v82, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v83, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v84, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v85, v15
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_add_nc_u16 v11, v64, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v51, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v50, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v48, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_or_b32_e32 v11, v27, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v29, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v65, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v66, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v67, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v25, v24
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-TRUE16-NEXT:  .LBB13_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v50.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v2.h, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v6, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v24.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v26.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-TRUE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v52.l, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v51.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v29.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v27.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v30.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v48.l, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v48.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v23.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v24.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v26.h, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v21.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v22.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v21.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v22.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v23.l, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v16.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v17.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v55, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v87, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-FAKE16-NEXT:  .LBB13_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v6, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v97
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v81
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-FAKE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v37, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v118, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v119, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v117, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v114, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v115, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v116, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v113, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v16, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v100, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v102, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v103, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v112, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v68, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v87, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v96, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v97, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v98, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v99, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v81, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v82, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v83, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v84, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v85, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v64, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v51, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v50, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v48, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v27, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v65, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v66, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v67, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v25, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -9528,295 +10576,581 @@ define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32bf16_to_v16f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB23_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v23, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v20, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v19, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v6
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
-; GFX11-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_bfe_u32 v22, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v18, v4, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v21, v3, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT:    v_bfe_u32 v24, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
-; GFX11-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v24, v20, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v23, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:  .LBB23_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v16f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v16, v19, v21 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v25, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v22, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v20, v24 :: v_dual_and_b32 v18, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v19, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v16, v20 :: v_dual_and_b32 v20, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v23, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v17, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v23, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v22 :: v_dual_cndmask_b32 v18, v18, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v26, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v20, v0
+; GFX11-TRUE16-NEXT:  .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v16f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -10889,324 +12223,598 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v16f32_to_v64i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB24_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v15, 1.0, v15
-; GFX11-NEXT:    v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7
-; GFX11-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v13, 1.0, v13
-; GFX11-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v9, 1.0, v9
-; GFX11-NEXT:    v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11
-; GFX11-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB24_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v96, 0xff, v96
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v87
-; GFX11-NEXT:    v_or_b32_e32 v24, v96, v24
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v86, v85
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xff, v83
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v24
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v82
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v80, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v71
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xff, v70
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v84
-; GFX11-NEXT:    v_or_b32_e32 v23, v83, v23
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v80
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v71
-; GFX11-NEXT:    v_or_b32_e32 v22, v70, v22
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v24
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v23
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v25
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v65
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v55
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v22
-; GFX11-NEXT:    v_or_b32_e32 v21, v23, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v22, v25, v54
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v52
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v50
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v39
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v24
-; GFX11-NEXT:    v_or_b32_e32 v23, v25, v49
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v48
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v22
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v37
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v35
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v34
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v69
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xff, v68
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v19, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-NEXT:    v_or_b32_e32 v18, v23, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v30
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v28
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v26
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v69
-; GFX11-NEXT:    v_or_b32_e32 v67, v68, v67
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v20
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v21
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v16f32_to_v64i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB24_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v9, 1.0, v9
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB24_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v1.h, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v2.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v54, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v25, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v7.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v53, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v8.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v21, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v24, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v20, v34
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v12.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v25, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v49, v50
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v51, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v35, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v22, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v19, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v16f32_to_v64i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB24_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v15, 1.0, v15
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v9, 1.0, v9
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v96, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v86, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v83, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v83, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v80
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v70, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v39, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v68, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, v68, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v21
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -12733,471 +14341,950 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64i8_to_v16f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v54, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v55, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v112, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v87, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v96, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v97, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v131
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_4
-; GFX11-NEXT:  .LBB25_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB25_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v118
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v117
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v6, v113
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v114
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v115
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v116
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v101
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v102
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v103
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v112
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v100
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v11
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v68
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v87
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v96
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v97
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v98
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v99
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v81
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v82
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v85
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v51
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v39
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v29
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v65
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v66
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v67
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v17, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v18, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v19, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v20, v24, v25
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB25_2
-; GFX11-NEXT:  .LBB25_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v5, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v37, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v118, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v119, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v2, v117, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v114, v3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v4, v115, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v116, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v113, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, v16, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v6, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v101, v7
-; GFX11-NEXT:    v_or_b32_e32 v6, v100, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v102, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v103, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v112, v10
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_add_nc_u16 v6, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v10, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v68, 3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v87, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v96, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v97, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v98, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v99, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v81, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v82, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v83, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v84, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v85, v15
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_add_nc_u16 v11, v64, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v51, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v50, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v48, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_or_b32_e32 v11, v27, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v29, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v65, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v66, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v67, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v25, v24
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v16f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-TRUE16-NEXT:  .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v50.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v2.h, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v6, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v24.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v26.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v52.l, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v51.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v29.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v27.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v30.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v48.l, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v48.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v23.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v24.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v26.h, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v21.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v22.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v21.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v22.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v23.l, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v16.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v17.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v16f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v55, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v87, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-FAKE16-NEXT:  .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v6, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v97
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v81
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v37, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v118, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v119, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v117, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v114, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v115, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v116, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v113, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v16, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v100, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v102, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v103, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v112, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v68, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v87, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v96, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v97, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v98, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v99, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v81, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v82, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v83, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v84, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v85, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v64, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v51, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v50, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v48, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v27, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v65, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v66, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v67, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v25, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -15763,295 +17850,581 @@ define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32bf16_to_v8i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB33_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v23, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v20, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v19, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v6
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
-; GFX11-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_bfe_u32 v22, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v18, v4, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v21, v3, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT:    v_bfe_u32 v24, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
-; GFX11-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v24, v20, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v23, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:  .LBB33_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v16, v19, v21 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v25, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v22, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v20, v24 :: v_dual_and_b32 v18, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v19, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v16, v20 :: v_dual_and_b32 v20, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v23, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v17, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v23, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v22 :: v_dual_cndmask_b32 v18, v18, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v26, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v20, v0
+; GFX11-TRUE16-NEXT:  .LBB33_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v8i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB33_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -17124,337 +19497,624 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i64_to_v64i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB34_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v11, vcc_lo, v11, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v13, vcc_lo, v13, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v15, vcc_lo, v15, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB34_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v96, 0xff, v96
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v87
-; GFX11-NEXT:    v_or_b32_e32 v24, v96, v24
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v86, v85
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xff, v83
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v24
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v82
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v80, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v71
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xff, v70
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v84
-; GFX11-NEXT:    v_or_b32_e32 v23, v83, v23
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v80
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v71
-; GFX11-NEXT:    v_or_b32_e32 v22, v70, v22
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v24
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v23
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v25
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v65
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v55
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v22
-; GFX11-NEXT:    v_or_b32_e32 v21, v23, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v22, v25, v54
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v52
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v50
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v39
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v24
-; GFX11-NEXT:    v_or_b32_e32 v23, v25, v49
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v48
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v22
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v37
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v35
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v34
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v69
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xff, v68
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v19, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-NEXT:    v_or_b32_e32 v18, v23, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v30
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v28
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v26
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v69
-; GFX11-NEXT:    v_or_b32_e32 v67, v68, v67
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v20
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v21
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i64_to_v64i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB34_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB34_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v11, vcc_lo, v11, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v13, vcc_lo, v13, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v15, vcc_lo, v15, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB34_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v1.h, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v2.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v54, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v25, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v7.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v53, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v8.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v21, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v24, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v20, v34
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v12.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v25, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v49, v50
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v51, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v35, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v22, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v19, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i64_to_v64i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB34_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v1, vcc_lo, v1, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v3, vcc_lo, v3, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v5, vcc_lo, v5, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v9, vcc_lo, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v11, vcc_lo, v11, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v13, vcc_lo, v13, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v15, vcc_lo, v15, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v7, vcc_lo, v7, 3
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB34_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v96, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v86, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v83, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v83, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v80
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v70, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v39, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v68, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, v68, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v21
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -18981,471 +21641,950 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64i8_to_v8i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v54, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v55, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v112, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v87, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v96, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v97, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v131
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_4
-; GFX11-NEXT:  .LBB35_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB35_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v118
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v117
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v6, v113
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v114
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v115
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v116
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v101
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v102
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v103
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v112
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v100
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v11
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v68
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v87
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v96
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v97
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v98
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v99
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v81
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v82
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v85
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v51
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v39
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v29
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v65
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v66
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v67
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v17, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v18, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v19, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v20, v24, v25
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB35_2
-; GFX11-NEXT:  .LBB35_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v5, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v37, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v118, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v119, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v2, v117, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v114, v3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v4, v115, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v116, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v113, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, v16, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v6, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v101, v7
-; GFX11-NEXT:    v_or_b32_e32 v6, v100, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v102, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v103, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v112, v10
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_add_nc_u16 v6, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v10, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v68, 3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v87, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v96, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v97, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v98, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v99, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v81, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v82, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v83, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v84, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v85, v15
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_add_nc_u16 v11, v64, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v51, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v50, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v48, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_or_b32_e32 v11, v27, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v29, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v65, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v66, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v67, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v25, v24
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-TRUE16-NEXT:  .LBB35_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v50.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v2.h, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v6, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v24.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v26.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-TRUE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v52.l, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v51.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v29.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v27.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v30.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v48.l, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v48.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v23.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v24.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v26.h, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v21.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v22.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v21.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v22.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v23.l, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v16.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v17.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v55, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v87, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-FAKE16-NEXT:  .LBB35_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v6, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v97
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v81
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-FAKE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v37, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v118, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v119, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v117, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v114, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v115, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v116, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v113, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v16, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v100, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v102, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v103, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v112, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v68, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v87, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v96, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v97, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v98, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v99, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v81, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v82, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v83, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v84, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v85, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v64, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v51, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v50, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v48, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v27, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v65, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v66, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v67, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v25, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -21645,295 +24784,581 @@ define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32bf16_to_v8f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB41_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v16, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v23, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_or_b32_e32 v18, 0x400000, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v20, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v15
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v13, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_bfe_u32 v18, v12, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v17, v11, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v19, v10, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v10
-; GFX11-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v16, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v18, v8, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v8
-; GFX11-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v19, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_bfe_u32 v19, v6, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v17, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v6
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
-; GFX11-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_bfe_u32 v22, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v16, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v18, v4, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
-; GFX11-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v21, v3, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v3
-; GFX11-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
-; GFX11-NEXT:    v_bfe_u32 v24, v2, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
-; GFX11-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v2
-; GFX11-NEXT:    v_bfe_u32 v22, v19, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v19
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v24, v20, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v20
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v21, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v23, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
-; GFX11-NEXT:  .LBB41_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v8f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v16, v19, v21 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v25, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v22, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v20, v24 :: v_dual_and_b32 v18, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v19, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v13, v16, v20 :: v_dual_and_b32 v20, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v18, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v20, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v16, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v18, v18, v23 :: v_dual_and_b32 v23, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v7, v16, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v19, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v17, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v16, v16, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v23, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_and_b32 v22, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v22 :: v_dual_cndmask_b32 v18, v18, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v26, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v20.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v22, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v20, v0
+; GFX11-TRUE16-NEXT:  .LBB41_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v8f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v23, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x400000, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v13, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v8, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v18, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v6, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v17, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v17, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v17, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v17, 0x40c00000, v19
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v20, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v16, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v16, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v16, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v16, v22, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v18, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v18, 0x40c00000, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v24, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v20, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v20
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v24, v20, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v23, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v21, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB41_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -23002,324 +26427,598 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8f64_to_v64i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB42_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB42_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB42_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; GFX11-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; GFX11-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; GFX11-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; GFX11-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; GFX11-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; GFX11-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; GFX11-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB42_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v96, 0xff, v96
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v87
-; GFX11-NEXT:    v_or_b32_e32 v24, v96, v24
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v86, v85
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xff, v83
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v24
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v82
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v80, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v71
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xff, v70
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v84
-; GFX11-NEXT:    v_or_b32_e32 v23, v83, v23
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v80
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v71
-; GFX11-NEXT:    v_or_b32_e32 v22, v70, v22
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v24
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v23
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v25
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v65
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v55
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v22
-; GFX11-NEXT:    v_or_b32_e32 v21, v23, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v22, v25, v54
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v52
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v50
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v39
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v24
-; GFX11-NEXT:    v_or_b32_e32 v23, v25, v49
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v48
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v22
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v37
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v35
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v34
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v69
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xff, v68
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v19, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-NEXT:    v_or_b32_e32 v18, v23, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v30
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v28
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v26
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v69
-; GFX11-NEXT:    v_or_b32_e32 v67, v68, v67
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v20
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v21
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8f64_to_v64i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB42_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB42_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB42_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-TRUE16-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB42_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v1.h, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v2.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v54, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v25, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v7.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v53, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v8.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v21, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v24, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v20, v34
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v12.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v25, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v49, v50
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v51, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v35, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v22, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v19, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8f64_to_v64i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB42_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB42_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB42_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; GFX11-FAKE16-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB42_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v96, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v86, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v83, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v83, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v80
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v70, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v39, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v68, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, v68, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v21
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -24846,471 +28545,950 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64i8_to_v8f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
-; GFX11-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v39, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v48, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v49, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v50, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v51, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v83, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v128, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v129, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v130, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v131, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v52, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v53, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v54, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v55, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v112, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v87, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v96, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v97, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b16 v98, 8, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b16 v99, 8, v65
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b16 v81, 8, v66
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b16 v82, 8, v67
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b16 v83, 8, v83
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v128
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v129
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b16 v65, 8, v130
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b16 v66, 8, v131
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v10
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_4
-; GFX11-NEXT:  .LBB43_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB43_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v34
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v118
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v119
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v35
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v117
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v6, v113
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v114
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v115
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v116
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v101
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v38
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v102
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v103
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v112
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v100
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v11
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v68
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v87
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v96
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v97
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v98
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v99
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v81
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v82
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v83
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v84
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v85
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v55
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v54
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v53
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v52
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v51
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v50
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v49
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v48
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v39
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v29
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v65
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v66
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v67
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v17
-; GFX11-NEXT:    v_or_b32_e32 v17, v18, v19
-; GFX11-NEXT:    v_or_b32_e32 v18, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v19, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v20, v24, v25
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB43_2
-; GFX11-NEXT:  .LBB43_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v31, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v32, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v35, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v5, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v37, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v118, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v119, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v2, v117, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v114, v3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v4, v115, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v116, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, v113, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, v16, 3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v5
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v6, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v18, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v20, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v7, v101, v7
-; GFX11-NEXT:    v_or_b32_e32 v6, v100, v6
-; GFX11-NEXT:    v_or_b32_e32 v8, v102, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v103, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v112, v10
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v9, v10
-; GFX11-NEXT:    v_add_nc_u16 v6, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, v26, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v28, 3
-; GFX11-NEXT:    v_add_nc_u16 v9, v30, 3
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v10, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v11, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v71, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v68, 3
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v87, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v96, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v97, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v98, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v99, v10
-; GFX11-NEXT:    v_or_b32_e32 v11, v81, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v82, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v83, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v84, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v85, v15
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v6
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v7
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v8
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v9
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v10
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v7, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v8, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v9, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v14, v15
-; GFX11-NEXT:    v_add_nc_u16 v11, v64, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, v55, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, v54, 3
-; GFX11-NEXT:    v_add_nc_u16 v14, v53, 3
-; GFX11-NEXT:    v_add_nc_u16 v15, v52, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, v51, 3
-; GFX11-NEXT:    v_add_nc_u16 v18, v50, 3
-; GFX11-NEXT:    v_add_nc_u16 v20, v49, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v48, 3
-; GFX11-NEXT:    v_add_nc_u16 v24, v39, 3
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_or_b32_e32 v11, v27, v11
-; GFX11-NEXT:    v_or_b32_e32 v12, v29, v12
-; GFX11-NEXT:    v_or_b32_e32 v13, v65, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v66, v14
-; GFX11-NEXT:    v_or_b32_e32 v15, v67, v15
-; GFX11-NEXT:    v_or_b32_e32 v16, v17, v16
-; GFX11-NEXT:    v_or_b32_e32 v17, v19, v18
-; GFX11-NEXT:    v_or_b32_e32 v18, v21, v20
-; GFX11-NEXT:    v_or_b32_e32 v19, v23, v22
-; GFX11-NEXT:    v_or_b32_e32 v20, v25, v24
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v11
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v12
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v13
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v14
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v15
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v16
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v18
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v19
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v20
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v12, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v16
-; GFX11-NEXT:    v_or_b32_e32 v14, v17, v18
-; GFX11-NEXT:    v_or_b32_e32 v15, v19, v20
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v8f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v65, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v66, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v66, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v67, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v67, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v68, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v68, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v69, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v69, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v70, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v70, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v71, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v71, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v80, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v80.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v55.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v24.h, 8, v81.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v67.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v67.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v82
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-TRUE16-NEXT:  .LBB43_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v53.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v54.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v50.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v52.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v2.h, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v6, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v2.h, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v24.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v26.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v6.h, v21.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v7.h, v22.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v11.h, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v12.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v18.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-TRUE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v55.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v53.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v52.h, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v52.l, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v51.l, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v50.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v29.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v27.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v30.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v48.l, v2.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v48.h, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v23.h, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v25.l, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v24.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v25.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v26.h, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v21.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v22.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v21.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v22.h, v7.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v6.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v11, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v13, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v23.l, v8.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v16.l, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v16.h, v11.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v17.l, v12.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v17.h, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v18.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v11.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v12.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v23, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v14, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v15, v18
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v8f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v39, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v48, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v49, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v50, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v51, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v83, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v128, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v129, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v130, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v131, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v52, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v53, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v54, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v55, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v87, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v97, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v99, 8, v65
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v66
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v82, 8, v67
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v83
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v129
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v65, 8, v130
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v66, 8, v131
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-FAKE16-NEXT:  .LBB43_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v119
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v6, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v101
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v103
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v112
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v100
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v97
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v99
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v81
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v65
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v66
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v18, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v24, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-FAKE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v31, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v32, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v35, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v37, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v118, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v119, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v117, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v114, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v115, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v116, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v113, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v16, 3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v18, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v20, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v101, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v100, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v102, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v103, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v112, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, v26, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v28, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, v30, 3
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v71, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v68, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v87, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v96, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v97, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v98, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v99, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v81, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v82, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v83, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v84, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v85, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v7
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v14, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, v64, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, v55, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, v54, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, v53, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, v52, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, v51, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, v50, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, v49, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v48, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, v39, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v27, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v29, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v65, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v66, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v67, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v17, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v19, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v21, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v23, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v25, v24
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v18
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v17, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v20
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -27227,267 +31405,563 @@ define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32bf16_to_v32i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB47_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v11
-; GFX11-NEXT:    v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v6
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v19, v16, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v16
-; GFX11-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_lshlrev_b32 v27, 16, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v19, v22, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v22, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v29, 0x40c00000, v29
-; GFX11-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v31, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v20, v23 :: v_dual_lshlrev_b32 v23, 16, v4
-; GFX11-NEXT:    v_bfe_u32 v20, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v23, 0x40c00000, v23
-; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v21, v19, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v20, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v31, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v19, v20 :: v_dual_add_f32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-NEXT:    v_bfe_u32 v21, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v31
-; GFX11-NEXT:    v_add3_u32 v19, v21, v18, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v21, v2, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v12, 16, 1
-; GFX11-NEXT:    v_perm_b32 v1, v1, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v18, v19, v20 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v21, v22, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v19, v20 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v19, v21, v22, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v22
-; GFX11-NEXT:    v_bfe_u32 v21, v3, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    v_bfe_u32 v22, v23, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v20, v21, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_or_b32_e32 v32, 0x400000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v20, v22, v23, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v23
-; GFX11-NEXT:    v_bfe_u32 v22, v4, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT:    v_bfe_u32 v23, v24, 16, 1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v21, v22, v4, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v21, v22, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v21, v23, v24, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v24
-; GFX11-NEXT:    v_bfe_u32 v23, v5, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_bfe_u32 v24, v25, 16, 1
-; GFX11-NEXT:    v_perm_b32 v4, v4, v20, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v22, v23, v5, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v22, v23, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v22, v24, v25, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v25
-; GFX11-NEXT:    v_bfe_u32 v24, v6, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v25, v26, 16, 1
-; GFX11-NEXT:    v_perm_b32 v5, v5, v21, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v23, v24, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v23, v24, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v23, v25, v26, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v26
-; GFX11-NEXT:    v_bfe_u32 v25, v7, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_bfe_u32 v26, v27, 16, 1
-; GFX11-NEXT:    v_perm_b32 v6, v6, v22, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v24, v25, v7, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v24, v25, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v24, v26, v27, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v27
-; GFX11-NEXT:    v_bfe_u32 v26, v8, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_bfe_u32 v27, v28, 16, 1
-; GFX11-NEXT:    v_perm_b32 v7, v7, v23, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v24, v25, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v25, v26, v8, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v8
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v25, v26, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v25, v27, v28, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v28
-; GFX11-NEXT:    v_bfe_u32 v27, v9, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_bfe_u32 v28, v29, 16, 1
-; GFX11-NEXT:    v_perm_b32 v8, v8, v24, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v26, v27, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v27, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v26, v27, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v26, v28, v29, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v27, 0x400000, v29
-; GFX11-NEXT:    v_bfe_u32 v28, v10, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT:    v_bfe_u32 v29, v30, 16, 1
-; GFX11-NEXT:    v_perm_b32 v9, v9, v25, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v26, v27, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v27, v28, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v28, 0x400000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v27, v28, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v27, v29, v30, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v28, 0x400000, v30
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    v_bfe_u32 v30, v31, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v29, v11, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v27, v27, v28 :: v_dual_lshlrev_b32 v28, 16, v13
-; GFX11-NEXT:    v_add3_u32 v30, v30, v31, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_add3_u32 v31, v34, v12, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-NEXT:    v_add3_u32 v29, v29, v11, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v30, v33, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_bfe_u32 v35, v28, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v28
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v31, v33, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v34, v35, v28, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_bfe_u32 v37, v13, 16, 1
-; GFX11-NEXT:    v_perm_b32 v10, v10, v26, 0x7060302
-; GFX11-NEXT:    v_dual_add_f32 v31, 0x40c00000, v35 :: v_dual_cndmask_b32 v28, v34, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v15
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    v_add3_u32 v33, v37, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v31
-; GFX11-NEXT:    v_bfe_u32 v38, v14, 16, 1
-; GFX11-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v34
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v37, v38, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_bfe_u32 v35, v15, 16, 1
-; GFX11-NEXT:    v_add3_u32 v39, v39, v34, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v37, v38, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v15, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v14, v14, v31, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v39, v48, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v35, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v34, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_perm_b32 v12, v12, v30, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v29, v32, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v11, v11, v27, 0x7060302
-; GFX11-NEXT:  .LBB47_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v19, 0x40c00000, v1 :: v_dual_lshlrev_b32 v34, 16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v1, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v17, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    v_add3_u32 v1, v1, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v16, v1, v22 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v23, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v24, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v37, v37, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v2 :: v_dual_cndmask_b32 v19, v23, v25
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v22, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v19.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v20, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v17, v22 :: v_dual_lshlrev_b32 v22, 16, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v17.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v22, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v22
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v19, v20, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v23
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v3, v3, 16, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v20, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v21, v21, v25 :: v_dual_add_f32 v24, 0x40c00000, v24
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v22, v25 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v25, 16, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v20, v26, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v26, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, v23, v26 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v22, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v5, v5, 16, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v21
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v24, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_lshlrev_b32 v27, 16, v9
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v23, v28, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v28, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v25, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v25, v28 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v25, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v24, v29, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v26, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v6, v6, 16, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v26, v29, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v29, 16, v11
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v25, v30, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v30, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v26, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v27, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v27, v30 :: v_dual_add_f32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v27, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v26, v31, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v28, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v9, 16, v25
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v7, v7, 16, v23
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v28, v31, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_lshlrev_b32 v31, 16, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v30
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v27, v32, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v32, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v28, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v29, v29, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v12, v29, v32 :: v_dual_add_f32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v31, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v35, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v28, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v30, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v29, v29, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v30, v35, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v34, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v38, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v34, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v11, 16, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v34, v35, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v30, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v12, 16, v28
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v10, 16, v26
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v8, v8, 16, v24
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v37, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v4, v4, 16, v20
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v17, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v16, 16, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v15.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v36, v38, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v34
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v22
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v30, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v13.h
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v13, 16, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v29, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v14, 16, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v13, v30, 16, v29
+; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v11
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_lshlrev_b32 v27, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v19, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v22, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v29, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v31, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v20, v23 :: v_dual_lshlrev_b32 v23, 16, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v21, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v20, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v19, v20 :: v_dual_add_f32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v19, v20 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v20 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v21, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, 0x400000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v22, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v21, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v23, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v25, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v23, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v24, v25, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v21, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v24, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v25, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v26, v27, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v22, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v25, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v26, v27, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v26, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v27, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v23, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v25, v26, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v25, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v25, v27, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v27, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v28, v29, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v24, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v26, v27, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v26, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v26, v28, v29, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v28, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v29, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v25, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v27, v28, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v27, v28, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v27, v29, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v29, v11, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v28 :: v_dual_lshlrev_b32 v28, 16, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v30, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v31, v34, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v29, v29, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v30, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v31, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v35, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v13, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v26, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v31, 0x40c00000, v35 :: v_dual_cndmask_b32 v28, v34, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v37, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v31, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v35, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v30, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v29, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v27, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB47_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -29097,332 +33571,614 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32i16_to_v64i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB48_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB48_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v96, 0xff, v96
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v87
-; GFX11-NEXT:    v_or_b32_e32 v24, v96, v24
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v86, v85
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xff, v83
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v24
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v82
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v80, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v71
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xff, v70
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v84
-; GFX11-NEXT:    v_or_b32_e32 v23, v83, v23
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v80
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v71
-; GFX11-NEXT:    v_or_b32_e32 v22, v70, v22
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v24
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v23
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v25
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v65
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v55
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v22
-; GFX11-NEXT:    v_or_b32_e32 v21, v23, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v22, v25, v54
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v52
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v50
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v39
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v24
-; GFX11-NEXT:    v_or_b32_e32 v23, v25, v49
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v48
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v22
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v37
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v35
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v34
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v69
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xff, v68
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v19, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-NEXT:    v_or_b32_e32 v18, v23, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v30
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v28
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v26
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v69
-; GFX11-NEXT:    v_or_b32_e32 v67, v68, v67
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v20
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v21
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32i16_to_v64i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB48_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB48_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v1.h, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v2.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v54, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v25, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v7.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v53, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v8.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v21, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v24, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v20, v34
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v12.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v25, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v49, v50
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v51, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v35, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v22, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v19, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32i16_to_v64i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB48_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB48_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v96, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v86, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v83, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v83, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v80
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v70, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v39, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v68, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, v68, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v21
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -31132,409 +35888,795 @@ define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64i8_to_v32i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10
-; GFX11-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v83, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v81, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b16 v98, 8, v96
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v100
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b16 v96, 8, v101
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v102
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b16 v112, 8, v103
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v113
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v114
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v115
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v116
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v117
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v4
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_4
-; GFX11-NEXT:  .LBB49_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB49_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v35
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v50
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v51
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v39
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v54
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v48
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v55
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v49
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v53
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v52
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v83
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v81
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v10, v9, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v84
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v21
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v19
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v27
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v23
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v98
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v25
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v96
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v29
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v112
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v103
-; GFX11-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v11, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v13, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v15, v14, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v82
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v99
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v66
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v67
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v101
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v100
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v114
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v102
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v117
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v113
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v116
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v115
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v118
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v119
-; GFX11-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v14, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v16, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v18, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v20, v19, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB49_2
-; GFX11-NEXT:  .LBB49_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v66, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v64, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u16 v6, v97, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v118, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v119, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v99, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v64, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v116, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v115, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v113, v1
-; GFX11-NEXT:    v_add_nc_u16 v66, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v114, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v117, v4
-; GFX11-NEXT:    v_add_nc_u16 v2, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v67, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_nc_u16 v0, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v82, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_nc_u16 v3, v87, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v84, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v102, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v101, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v100, v1
-; GFX11-NEXT:    v_add_nc_u16 v69, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v112, v3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v103, v4
-; GFX11-NEXT:    v_add_nc_u16 v70, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v1, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v68, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v0, v71, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v65, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v28, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v96, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v29, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v98, v0
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v25, v3
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v27, v4
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v26, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v20, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v21, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v83, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v81, v4
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v31, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v32, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v53, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v55, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v54, v1
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v17, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v52, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v35, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v1
-; GFX11-NEXT:    v_or_b32_e32 v17, v50, v17
-; GFX11-NEXT:    v_or_b32_e32 v21, v51, v21
-; GFX11-NEXT:    v_or_b32_e32 v22, v39, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v48, v23
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v2
-; GFX11-NEXT:    v_perm_b32 v0, v21, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v23, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v24, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v26, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v16, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v20, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v25, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v68, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v70, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v69, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v66, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v64, v15, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-TRUE16-NEXT:  .LBB49_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v29.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v54.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v55.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-TRUE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v55.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v53.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v49.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v48.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v24.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v28.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v29.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v30.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v22.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v21.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v17.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v25.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v26.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v27.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v25.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v16.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v17.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v27.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v22.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v20.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v20.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v23.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v23.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v16.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v16.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v96
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v100
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v101
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v102
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v103
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v113
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v114
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v115
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v116
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-FAKE16-NEXT:  .LBB49_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v81
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v112
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v103
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v9, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v13, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v15, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v101
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v119
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v14, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v16, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-FAKE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v66, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v64, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v97, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v118, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v119, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v99, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v64, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v116, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v115, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v113, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v66, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v114, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v117, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v82, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v84, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v102, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v101, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v100, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v69, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v112, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v103, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v70, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v68, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v71, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v28, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v96, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v29, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v25, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v27, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v30, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v26, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v20, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v21, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v83, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v81, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v31, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v18, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v32, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v53, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v55, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v54, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v17, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v52, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v35, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v50, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v51, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v39, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v48, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v21, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v23, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v24, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v16, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v20, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v25, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v68, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v70, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v69, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v66, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -32899,267 +38041,550 @@ define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32bf16_to_v32f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB51_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v11
-; GFX11-NEXT:    v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v6
-; GFX11-NEXT:    v_bfe_u32 v21, v17, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v19, v16, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v16
-; GFX11-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_lshlrev_b32 v27, 16, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v19, v22, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v19, 0x400000, v17
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v22, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; GFX11-NEXT:    v_bfe_u32 v20, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v29, 0x40c00000, v29
-; GFX11-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v31, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v20, v23 :: v_dual_lshlrev_b32 v23, 16, v4
-; GFX11-NEXT:    v_bfe_u32 v20, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v23, 0x40c00000, v23
-; GFX11-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v21, v19, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v19, v20, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v31, 0x40c00000, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v19, v20 :: v_dual_add_f32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-NEXT:    v_bfe_u32 v21, v18, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v18
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v31
-; GFX11-NEXT:    v_add3_u32 v19, v21, v18, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v21, v2, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v34, v12, 16, 1
-; GFX11-NEXT:    v_perm_b32 v1, v1, v17, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v18, v19, v20 :: v_dual_and_b32 v7, 0xffff0000, v7
-; GFX11-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v21, v22, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v19, v20 :: v_dual_and_b32 v11, 0xffff0000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v19, v21, v22, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v22
-; GFX11-NEXT:    v_bfe_u32 v21, v3, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    v_bfe_u32 v22, v23, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; GFX11-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v20, v21, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_or_b32_e32 v32, 0x400000, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v20, v22, v23, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v21, 0x400000, v23
-; GFX11-NEXT:    v_bfe_u32 v22, v4, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT:    v_bfe_u32 v23, v24, 16, 1
-; GFX11-NEXT:    v_perm_b32 v3, v3, v19, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v21, v22, v4, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v21, v22, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v21, v23, v24, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v24
-; GFX11-NEXT:    v_bfe_u32 v23, v5, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    v_bfe_u32 v24, v25, 16, 1
-; GFX11-NEXT:    v_perm_b32 v4, v4, v20, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v22, v23, v5, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v22, v23, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v22, v24, v25, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v25
-; GFX11-NEXT:    v_bfe_u32 v24, v6, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    v_bfe_u32 v25, v26, 16, 1
-; GFX11-NEXT:    v_perm_b32 v5, v5, v21, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v23, v24, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v6
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v23, v24, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v23, v25, v26, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v26
-; GFX11-NEXT:    v_bfe_u32 v25, v7, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT:    v_bfe_u32 v26, v27, 16, 1
-; GFX11-NEXT:    v_perm_b32 v6, v6, v22, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v24, v25, v7, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v24, v25, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v24, v26, v27, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v27
-; GFX11-NEXT:    v_bfe_u32 v26, v8, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT:    v_bfe_u32 v27, v28, 16, 1
-; GFX11-NEXT:    v_perm_b32 v7, v7, v23, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v24, v25, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v25, v26, v8, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v8
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v25, v26, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v25, v27, v28, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v26, 0x400000, v28
-; GFX11-NEXT:    v_bfe_u32 v27, v9, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_bfe_u32 v28, v29, 16, 1
-; GFX11-NEXT:    v_perm_b32 v8, v8, v24, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v26, v27, v9, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v27, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v26, v27, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v26, v28, v29, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v27, 0x400000, v29
-; GFX11-NEXT:    v_bfe_u32 v28, v10, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
-; GFX11-NEXT:    v_bfe_u32 v29, v30, 16, 1
-; GFX11-NEXT:    v_perm_b32 v9, v9, v25, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v26, v27, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v27, v28, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v28, 0x400000, v10
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v27, v28, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v27, v29, v30, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v28, 0x400000, v30
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    v_bfe_u32 v30, v31, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v29, v11, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v27, v27, v28 :: v_dual_lshlrev_b32 v28, 16, v13
-; GFX11-NEXT:    v_add3_u32 v30, v30, v31, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    v_add3_u32 v31, v34, v12, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; GFX11-NEXT:    v_add3_u32 v29, v29, v11, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v30, v33, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_bfe_u32 v35, v28, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v28
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v31, v33, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v34, v35, v28, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v35, 16, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_bfe_u32 v37, v13, 16, 1
-; GFX11-NEXT:    v_perm_b32 v10, v10, v26, 0x7060302
-; GFX11-NEXT:    v_dual_add_f32 v31, 0x40c00000, v35 :: v_dual_cndmask_b32 v28, v34, v36
-; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v15
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    v_add3_u32 v33, v37, v13, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v31
-; GFX11-NEXT:    v_bfe_u32 v38, v14, 16, 1
-; GFX11-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_bfe_u32 v39, v34, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v48, 0x400000, v34
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v37, v38, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_bfe_u32 v35, v15, 16, 1
-; GFX11-NEXT:    v_add3_u32 v39, v39, v34, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v49, 0x400000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v37, v38, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
-; GFX11-NEXT:    v_add3_u32 v35, v35, v15, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v14, v14, v31, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v39, v48, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v35, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v15, v15, v34, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v33, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_perm_b32 v12, v12, v30, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v29, v32, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v11, v11, v27, 0x7060302
-; GFX11-NEXT:  .LBB51_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v32f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v32, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v0, v16, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v17, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v0, v0, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v16, v21, v23 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v17, v22, v24 :: v_dual_add_f32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v22, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v18, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v22, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v23, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v17
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v16, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v22, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v24 :: v_dual_add_f32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v22, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v3, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v24, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v19, v19, v23 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v24, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v25, v22, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v26
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v22, v24, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v23, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v21, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v26, v23, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v23
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v26, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v24, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v26, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v21, v21, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v26, v27, v24, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v24
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v5, v22
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v23, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v24, v26, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v26, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v25, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_lshlrev_b32 v9, 16, v9
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v4, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v28, v25, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v25
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v28, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v26, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v28, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v23, v23, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v27, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    v_add3_u32 v28, v29, v26, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v26
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v25, v27, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v30
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v25, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v28, v29, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v28, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v27, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v25, v29, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v25, v30, v27, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v27
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v30, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v31, v28, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v30, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v25, v25, v29, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v29, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v30, v31, v28, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v28
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v6, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v27, v29, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v27, v11, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v29, 0x40c00000, v32
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v7, v24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v27, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v30, v31, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v30, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v29, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v27, v31, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v27, v32, v29, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v32, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v30, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v29, v32, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v27, v31, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v31, 0x400000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v32, v33, v30, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v33, 0x400000, v30
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v34, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v29, v31, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v31, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v29, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-TRUE16-NEXT:    v_add3_u32 v29, v29, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v35, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v48, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v36, v32, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v32
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v38, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v36, v36, v32, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v49, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v33, v33, v31, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v38, v38, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v36, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v12, v27
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v11, v28
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.l, v32.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v14, v35, v48, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v10, v25
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v9, v26
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v15, v38, v49, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v32, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v29, v34, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v33, v37, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v13, v30
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v14, v29
+; GFX11-TRUE16-NEXT:  .LBB51_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v32f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v16
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v28, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v11
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v17, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_lshlrev_b32 v27, 16, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v19, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v22, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v29, 0x40c00000, v29
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v31, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v20, v23 :: v_dual_lshlrev_b32 v23, 16, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v16, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v21, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v20, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v31, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v19, v20 :: v_dual_add_f32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v12, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v18, v19, v20 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v19, v20 :: v_dual_and_b32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v21, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v19, v20, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v21, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, 0x400000, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v22, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v22, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v21, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v23, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v23, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v25, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v4, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v21, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v23, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v24, v25, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v26, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v5, v21, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v22, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v24, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v23, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v23, v25, v26, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v26, v27, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v6, v22, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v25, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v24, v26, v27, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v26, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v27, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v23, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v25, v26, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v25, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v25, v27, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v27, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v28, v29, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v8, v24, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v26, v27, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v26, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v26, v28, v29, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v27, 0x400000, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v28, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v29, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v9, v25, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v26, v26, v27, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v27, v28, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v27, v28, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v27, v29, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v28, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v29, v11, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v27, v27, v28 :: v_dual_lshlrev_b32 v28, 16, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v30, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_add3_u32 v31, v34, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v29, v29, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v30, v30, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v28, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v28
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v31, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v34, v35, v28, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v35, 16, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v37, v13, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v10, v26, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v31, 0x40c00000, v35 :: v_dual_cndmask_b32 v28, v34, v36
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v15
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v37, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v38, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v39, v34, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v48, 0x400000, v34
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v35, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v37, v38, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v38, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v39, v39, v34, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v49, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v37, v38, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v14, v31, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v34, v39, v48, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v35, v49, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v15, v34, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v33, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v12, v30, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v13, v28, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v29, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v11, v27, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB51_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -34602,332 +40027,614 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32f16_to_v64i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB52_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB52_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB52_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB52_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v96, 0xff, v96
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v86, 0xff, v86
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v87
-; GFX11-NEXT:    v_or_b32_e32 v24, v96, v24
-; GFX11-NEXT:    v_lshlrev_b16 v85, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v25
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_or_b32_e32 v25, v86, v85
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_lshlrev_b16 v84, 8, v84
-; GFX11-NEXT:    v_and_b32_e32 v83, 0xff, v83
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v24
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v82
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v80, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_lshlrev_b16 v71, 8, v71
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v81
-; GFX11-NEXT:    v_and_b32_e32 v70, 0xff, v70
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v84
-; GFX11-NEXT:    v_or_b32_e32 v23, v83, v23
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v80
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v71
-; GFX11-NEXT:    v_or_b32_e32 v22, v70, v22
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v24
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v23
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v25
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v22
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v66
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v65
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v55
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v53
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v22
-; GFX11-NEXT:    v_or_b32_e32 v21, v23, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v22, v25, v54
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v52
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v50
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v48
-; GFX11-NEXT:    v_and_b32_e32 v39, 0xff, v39
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v53
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v24
-; GFX11-NEXT:    v_or_b32_e32 v23, v25, v49
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v48
-; GFX11-NEXT:    v_or_b32_e32 v19, v39, v19
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v22
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v38
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v37
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v35
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v34
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v33
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v69, 8, v69
-; GFX11-NEXT:    v_and_b32_e32 v68, 0xff, v68
-; GFX11-NEXT:    v_lshlrev_b16 v67, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v19, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-NEXT:    v_or_b32_e32 v18, v23, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v32
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v30
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v29
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v28
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v27
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v26
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v69
-; GFX11-NEXT:    v_or_b32_e32 v67, v68, v67
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v67
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v20
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v21
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32f16_to_v64i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB52_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB52_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB52_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-TRUE16-NEXT:  .LBB52_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v1.h, v18.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v2.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v19.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v53, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-TRUE16-NEXT:    v_or_b16 v24.l, v3.h, v20.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v54, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v49.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v49, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v50, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v54, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v25, v22
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v22.l, v7.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v35.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v53, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v23.l, v8.h, v20.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v34.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v21, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 8, v32.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v35, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v24, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v20, v34
+; GFX11-TRUE16-NEXT:    v_or_b16 v20.l, v12.h, v18.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v13.h, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v14.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v26.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v51, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v25, v23
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v49, v50
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v51, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v35, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v21, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v22, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v19, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32f16_to_v64i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB52_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB52_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB52_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB52_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v96, 0xff, v96
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v86, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v96, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v86, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v84, 8, v84
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v83, 0xff, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v82
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v80, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v71, 8, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v70, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v83, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v80
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v71
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v70, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v55
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v50
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v48
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v39, 0xff, v39
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v39, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v35
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v69, 8, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v68, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v67, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v30
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v69
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v67, v68, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v21
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -36575,409 +42282,795 @@ define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64i8_to_v32f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10
-; GFX11-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v83, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v81, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b16 v98, 8, v96
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v100
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b16 v96, 8, v101
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v102
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b16 v112, 8, v103
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v113
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v114
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v115
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v116
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v117
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v4
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_4
-; GFX11-NEXT:  .LBB53_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB53_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v35
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v50
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v51
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v39
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v54
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v48
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v55
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v49
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v53
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v52
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v83
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v81
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v10, v9, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v84
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v21
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v19
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v27
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v23
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v98
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v25
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v96
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v29
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v112
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v103
-; GFX11-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v11, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v13, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v15, v14, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v82
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v99
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v66
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v67
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v101
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v100
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v114
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v102
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v117
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v113
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v116
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v115
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v118
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v119
-; GFX11-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v14, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v16, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v18, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v20, v19, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB53_2
-; GFX11-NEXT:  .LBB53_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v66, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v64, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u16 v6, v97, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v118, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v119, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v99, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v64, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v116, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v115, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v113, v1
-; GFX11-NEXT:    v_add_nc_u16 v66, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v114, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v117, v4
-; GFX11-NEXT:    v_add_nc_u16 v2, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v67, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_nc_u16 v0, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v82, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_nc_u16 v3, v87, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v84, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v102, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v101, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v100, v1
-; GFX11-NEXT:    v_add_nc_u16 v69, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v112, v3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v103, v4
-; GFX11-NEXT:    v_add_nc_u16 v70, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v1, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v68, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v0, v71, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v65, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v28, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v96, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v29, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v98, v0
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v25, v3
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v27, v4
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v26, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v20, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v21, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v83, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v81, v4
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v31, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v32, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v53, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v55, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v54, v1
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v17, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v52, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v35, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v1
-; GFX11-NEXT:    v_or_b32_e32 v17, v50, v17
-; GFX11-NEXT:    v_or_b32_e32 v21, v51, v21
-; GFX11-NEXT:    v_or_b32_e32 v22, v39, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v48, v23
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v2
-; GFX11-NEXT:    v_perm_b32 v0, v21, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v23, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v24, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v26, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v16, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v20, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v25, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v68, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v70, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v69, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v66, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v64, v15, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-TRUE16-NEXT:  .LBB53_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v29.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v54.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v55.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-TRUE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v55.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v53.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v49.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v48.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v24.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v28.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v29.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v30.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v22.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v21.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v17.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v25.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v26.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v27.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v25.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v16.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v17.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v27.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v22.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v20.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v20.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v23.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v23.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v16.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v16.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v96
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v100
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v101
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v102
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v103
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v113
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v114
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v115
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v116
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-FAKE16-NEXT:  .LBB53_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v81
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v112
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v103
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v9, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v13, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v15, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v101
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v119
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v14, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v16, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-FAKE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v66, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v64, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v97, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v118, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v119, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v99, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v64, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v116, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v115, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v113, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v66, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v114, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v117, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v82, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v84, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v102, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v101, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v100, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v69, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v112, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v103, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v70, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v68, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v71, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v28, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v96, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v29, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v25, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v27, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v30, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v26, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v20, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v21, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v83, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v81, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v31, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v18, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v32, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v53, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v55, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v54, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v17, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v52, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v35, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v50, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v51, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v39, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v48, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v21, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v23, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v24, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v16, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v20, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v25, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v68, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v70, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v69, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v66, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -38952,587 +45045,1166 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v32bf16_to_v64i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB54_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 24, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 8, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 24, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 24, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 24, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
-; GFX11-NEXT:  .LBB54_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB54_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v18, 0x40c00000, v18
-; GFX11-NEXT:    v_add_f32_e32 v19, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v1, v17, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v20, 0x400000, v17
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT:    v_bfe_u32 v22, v18, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v24, v19, 16, 1
-; GFX11-NEXT:    v_add3_u32 v1, v1, v17, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_or_b32_e32 v17, 0x400000, v18
-; GFX11-NEXT:    v_add3_u32 v22, v22, v18, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v20 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_add3_u32 v20, v24, v19, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v21, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v21, v21, v2, 0x7fff
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v21, v23 :: v_dual_lshlrev_b32 v21, 16, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v19
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v27, v2, v1, 0x7060302
-; GFX11-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v21, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v22, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v18
-; GFX11-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v20, v23, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v20, v18, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v24, v21, 16, 1
-; GFX11-NEXT:    v_perm_b32 v26, v19, v17, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v19, v4, 16, 1
-; GFX11-NEXT:    v_add3_u32 v20, v20, v18, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
-; GFX11-NEXT:    v_add3_u32 v19, v19, v4, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v20, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_or_b32_e32 v22, 0x400000, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 24, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v26
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v19, v23 :: v_dual_lshlrev_b32 v23, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_add3_u32 v19, v24, v21, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT:    v_perm_b32 v29, v4, v18, 0x7060302
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; GFX11-NEXT:    v_bfe_u32 v20, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v19, v19, v22 :: v_dual_lshlrev_b32 v22, 16, v5
-; GFX11-NEXT:    v_add_f32_e32 v21, 0x40c00000, v23
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v20, v20, v3, 0x7fff
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
-; GFX11-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v20, v24, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v20, v21, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v6
-; GFX11-NEXT:    v_bfe_u32 v25, v22, 16, 1
-; GFX11-NEXT:    v_perm_b32 v28, v3, v19, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v3, v6, 16, 1
-; GFX11-NEXT:    v_add3_u32 v20, v20, v21, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
-; GFX11-NEXT:    v_add3_u32 v3, v3, v6, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_dual_cndmask_b32 v20, v20, v23 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_add3_u32 v6, v25, v22, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v23, 0x400000, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v24, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
-; GFX11-NEXT:    v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v21, v21, v5, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v6, v23, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v6, 0x40c00000, v24
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v7
-; GFX11-NEXT:    v_perm_b32 v86, v3, v20, 0x7060302
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v5, v21, v25 :: v_dual_add_f32 v8, 0x40c00000, v8
-; GFX11-NEXT:    v_bfe_u32 v21, v6, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
-; GFX11-NEXT:    v_perm_b32 v85, v5, v22, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v5, v8, 16, 1
-; GFX11-NEXT:    v_add3_u32 v21, v21, v6, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v85
-; GFX11-NEXT:    v_add3_u32 v5, v5, v8, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v21, v24, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v25, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
-; GFX11-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX11-NEXT:    v_perm_b32 v83, v5, v6, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v30, v23, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v24, 0x400000, v23
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
-; GFX11-NEXT:    v_add3_u32 v8, v30, v23, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v23, v8, v24 :: v_dual_lshlrev_b32 v24, 16, v9
-; GFX11-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; GFX11-NEXT:    v_add_f32_e32 v8, 0x40c00000, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v31, v24, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v8
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; GFX11-NEXT:    v_bfe_u32 v21, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v30, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v21, v21, v7, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v21, v30, vcc_lo
-; GFX11-NEXT:    v_bfe_u32 v21, v8, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT:    v_or_b32_e32 v30, 0x400000, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v82, v7, v23, 0x7060302
-; GFX11-NEXT:    v_bfe_u32 v7, v10, 16, 1
-; GFX11-NEXT:    v_add3_u32 v21, v21, v8, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v82
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v7, v7, v10, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v21, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_add3_u32 v10, v31, v24, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 8, v82
-; GFX11-NEXT:    v_dual_cndmask_b32 v7, v7, v30 :: v_dual_lshlrev_b32 v30, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v70, v7, v8, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v7
-; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v10, v25, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 8, v70
-; GFX11-NEXT:    v_bfe_u32 v30, v12, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v30, v30, v12, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v21, v9, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v31, 0x400000, v9
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_add3_u32 v21, v21, v9, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v21, v31, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
-; GFX11-NEXT:    v_bfe_u32 v25, v10, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT:    v_perm_b32 v69, v9, v24, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v9, 0x40c00000, v21
-; GFX11-NEXT:    v_add3_u32 v21, v25, v10, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v10
-; GFX11-NEXT:    v_or_b32_e32 v31, 0x400000, v12
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT:    v_bfe_u32 v32, v9, 16, 1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v69
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v21, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
-; GFX11-NEXT:    v_or_b32_e32 v25, 0x400000, v9
-; GFX11-NEXT:    v_add3_u32 v21, v32, v9, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 8, v69
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v30, v31, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v30, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; GFX11-NEXT:    v_perm_b32 v55, v12, v10, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v21, v25, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v21, 0x40c00000, v30
-; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; GFX11-NEXT:    v_bfe_u32 v31, v11, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v30, 0x400000, v11
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_bfe_u32 v32, v21, 16, 1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 8, v55
-; GFX11-NEXT:    v_add3_u32 v25, v31, v11, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v25, v30, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v25, 0x40c00000, v31
-; GFX11-NEXT:    v_add3_u32 v30, v32, v21, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v31, 0x400000, v21
-; GFX11-NEXT:    v_bfe_u32 v32, v14, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-NEXT:    v_bfe_u32 v33, v25, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v25
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v30, v31, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v30, v32, v14, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v31, 0x400000, v14
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
-; GFX11-NEXT:    v_add3_u32 v32, v33, v25, 0x7fff
-; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v30, v31, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_f32_e32 v30, 0x40c00000, v33
-; GFX11-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; GFX11-NEXT:    v_perm_b32 v54, v11, v9, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v50, v14, v21, 0x7060302
-; GFX11-NEXT:    v_dual_cndmask_b32 v25, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15
-; GFX11-NEXT:    v_bfe_u32 v35, v13, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v32, 0x400000, v13
-; GFX11-NEXT:    v_bfe_u32 v33, v30, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT:    v_add3_u32 v31, v35, v13, 0x7fff
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 8, v50
-; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v31, v32, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v31, 0x40c00000, v34
-; GFX11-NEXT:    v_add3_u32 v32, v33, v30, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v33, 0x400000, v30
-; GFX11-NEXT:    v_bfe_u32 v34, v16, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
-; GFX11-NEXT:    v_bfe_u32 v35, v31, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v30, v15, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v31
-; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v33, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v33, v34, v16, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v16
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
-; GFX11-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v30, v30, v15, 0x7fff
-; GFX11-NEXT:    v_perm_b32 v49, v13, v25, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v25
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v33, v34, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v35, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v49
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v54
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v30, v37, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v37, v16, v32, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v32
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v36, v15, v31, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 24, v37
-; GFX11-NEXT:    v_lshrrev_b64 v[17:18], 24, v[36:37]
-; GFX11-NEXT:    v_lshrrev_b64 v[18:19], 24, v[49:50]
-; GFX11-NEXT:    v_lshrrev_b64 v[19:20], 24, v[54:55]
-; GFX11-NEXT:    v_lshrrev_b64 v[20:21], 24, v[69:70]
-; GFX11-NEXT:    v_lshrrev_b64 v[21:22], 24, v[82:83]
-; GFX11-NEXT:    v_lshrrev_b64 v[22:23], 24, v[85:86]
-; GFX11-NEXT:    v_lshrrev_b64 v[23:24], 24, v[28:29]
-; GFX11-NEXT:    v_lshrrev_b64 v[24:25], 24, v[26:27]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 8, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v50
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 24, v55
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 24, v70
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 24, v83
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 8, v83
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 24, v86
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 8, v86
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 24, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v26
-; GFX11-NEXT:  .LBB54_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v97
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v25
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v27
-; GFX11-NEXT:    v_or_b32_e32 v24, v26, v24
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v81
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v96
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b16 v28, 8, v28
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v25
-; GFX11-NEXT:    v_or_b32_e32 v25, v26, v27
-; GFX11-NEXT:    v_and_b32_e32 v81, 0xff, v87
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v24
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v29
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v28
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v86
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v68
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v85
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v84
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v22
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v83
-; GFX11-NEXT:    v_and_b32_e32 v65, 0xff, v65
-; GFX11-NEXT:    v_lshlrev_b16 v68, 8, v82
-; GFX11-NEXT:    v_or_b32_e32 v23, v81, v23
-; GFX11-NEXT:    v_or_b32_e32 v25, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v27
-; GFX11-NEXT:    v_or_b32_e32 v22, v28, v22
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v29
-; GFX11-NEXT:    v_or_b32_e32 v26, v65, v68
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v24
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v23
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v25
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v22
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v26
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v80
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v71
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v70
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v52
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v69
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v67
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v22
-; GFX11-NEXT:    v_or_b32_e32 v21, v23, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v24
-; GFX11-NEXT:    v_or_b32_e32 v22, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v27
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v66
-; GFX11-NEXT:    v_lshlrev_b16 v20, 8, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v64
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v38
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v55
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v54
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v53
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_or_b32_e32 v20, v23, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v24
-; GFX11-NEXT:    v_or_b32_e32 v23, v25, v26
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v27
-; GFX11-NEXT:    v_or_b32_e32 v19, v28, v19
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX11-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v21
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v22
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v20
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v23
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v19
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v51
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v33
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v50
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v49
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v48
-; GFX11-NEXT:    v_lshlrev_b16 v18, 8, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v39
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v19, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v22
-; GFX11-NEXT:    v_or_b32_e32 v18, v23, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v24
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v31
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v37
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-NEXT:    v_lshlrev_b16 v22, 8, v36
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v35
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v24, 8, v34
-; GFX11-NEXT:    v_and_b32_e32 v25, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v26, 8, v32
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v24
-; GFX11-NEXT:    v_or_b32_e32 v21, v25, v26
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; GFX11-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GFX11-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v19
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v18
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v20
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v17
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v21
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
-; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v32bf16_to_v64i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr103_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr87_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr65_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr71_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr85_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr84_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr81_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 24, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.h, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v31.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v35.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v32.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v33.h, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v49.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v34.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v36.h, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v65.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v38.h, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v48.h, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v70.h, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v52.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v53.h, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v85.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v68.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v67.h, v14.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v84.h, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v81.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v82.h, v16.h
+; GFX11-TRUE16-NEXT:  .LBB54_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v39, 0xffff0000, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v18, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v26, v20, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v26.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v27, v21, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v27
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v28, v20, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v103, 24, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v28.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v20, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v112, 8, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v30, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v29, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v30.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v113, 8, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v31, v18, v19 :: v_dual_add_f32 v18, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v31.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v21, v18, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0xffff, v17, v29
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v6, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0xffff, v19, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v21, v18, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v6, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v6
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v100, 24, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v101, 8, v4
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v102, 8, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v32, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v33, v19, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v32.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v17, v17, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v35, v17, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v8, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v35.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v20, v6, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v17, v8, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x400000, v8
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v34, v6, v17 :: v_dual_add_f32 v19, 0x40c00000, v19
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v6, 0xffff, v18, v33
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v34.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v97, 24, v6
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v98, 8, v6
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v36, v5, v22 :: v_dual_and_b32 v23, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0xffff, v21, v20
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v8, v36
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v99, 8, v5
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 8, v8
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v20, 16, 1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v49, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v20, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 0x400000, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v18, v17, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v17
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v10, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v17, v18, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v10
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v49.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v38, v7, v21, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v38.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v10
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v19, v48
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v12, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v7, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v22, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v37, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v19, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v22, v22, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v23
+; GFX11-TRUE16-NEXT:    v_add3_u32 v24, v24, v12, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v50, 0x400000, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v21, 0x40c00000, v39 :: v_dual_cndmask_b32 v52, v22, v37
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v9, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v9
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff0000, v14
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v65, v19, v25 :: v_dual_lshlrev_b32 v14, 16, v14
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-TRUE16-NEXT:    v_add3_u32 v20, v20, v9, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v52.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v53, v24, v50, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 24, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v7, v53
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v20, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v19, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v65.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 8, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v11, v19, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v23, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v19, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v20, v9
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v21, v24, v19, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x400000, v19
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v25, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v25, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v67, v21, v22, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v19, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v14, v25, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff0000, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v68, v23, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v13, 16, 1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v9
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v68.h
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v15
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v23, v13, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v23, 0x400000, v13
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v16, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v25, 0x400000, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v85, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v16, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v23, v21, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v24, v14, 16, 1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v39, 0x400000, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v23, v23, v21, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v81, v13, v25, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-TRUE16-NEXT:    v_add3_u32 v19, v24, v14, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v24, 0x400000, v14
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v37, v15, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v84, v23, v39, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v85.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v37, v15, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v70.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v14, 0xffff, v22, v67
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v82, v19, v24, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v81.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v84.h
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0xffff, v23, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 24, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v13, v13, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v16, 0xffff, v19, v82
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 8, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v15, 0xffff, v15, v13
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v21, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0xffff, v18, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 8, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v96, 8, v7
+; GFX11-TRUE16-NEXT:  .LBB54_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v113.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v112.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v1.h, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v102.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v103.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v2.h, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v101.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v3.h, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v99.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v100.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.h, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v14, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v49.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v96.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v10, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 8, v87.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.l, 8, v86.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v83.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v16, v22
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v7.h, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.h, 8, v20.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v98.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.h, 8, v80.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v48.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v71.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v11.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v23, v24
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v70.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v69.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v16, v21
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v12, v22
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v14, v19
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v66.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.l, 8, v64.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v13.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.l, 8, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 8, v54.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v97.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.h, v15.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v13.h, v16.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.h, v17.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v24, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v20, v23
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v22, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v50.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v81.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v15.h, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v25, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v24, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v23, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v14
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v25, 0xffff, v15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v21, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, v22, v17
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, v23, v24
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v25, v16
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-TRUE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v32bf16_to_v64i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB54_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 24, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 24, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-FAKE16-NEXT:  .LBB54_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v18, 0x40c00000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v19, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v1, v17, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, 0x400000, v17
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v22, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v19, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v1, v1, v17, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add3_u32 v22, v22, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v20 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v24, v19, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v2, v21, v23 :: v_dual_lshlrev_b32 v21, 16, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v27, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v21, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v22, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v18
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v19, v20, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v18, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v24, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v26, v19, v17, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v18, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v19, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v18, v20, v22, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v96, 24, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v97, 16, v26
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v19, v23 :: v_dual_lshlrev_b32 v23, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v19, v24, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_perm_b32 v29, v4, v18, 0x7060302
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v19, v19, v22 :: v_dual_lshlrev_b32 v22, 16, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v20, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v6
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v22, 16, 1
+; GFX11-FAKE16-NEXT:    v_perm_b32 v28, v3, v19, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v20, v20, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v20, v20, v23 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v25, v22, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, 0x400000, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v3, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v22, v6, v23, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v6, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v7
+; GFX11-FAKE16-NEXT:    v_perm_b32 v86, v3, v20, 0x7060302
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v21, v25 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v6, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v85, v5, v22, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v6, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v85
+; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v21, v24, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v5, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v10
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v83, v5, v6, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v23, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 0x400000, v23
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v30, v23, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v23, v8, v24 :: v_dual_lshlrev_b32 v24, 16, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 0x40c00000, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v31, v24, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v21, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v8, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, 0x400000, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v82, v7, v23, 0x7060302
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v8, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v82
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v21, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_add3_u32 v10, v31, v24, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 8, v82
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v7, v7, v30 :: v_dual_lshlrev_b32 v30, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v70, v7, v8, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v7
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v24, v10, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v10, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v23
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 8, v70
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v12, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v30, v12, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v21, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v21, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v21, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v25, v10, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v69, v9, v24, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v9, 0x40c00000, v21
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v25, v10, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v9, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, v21, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 0x400000, v9
+; GFX11-FAKE16-NEXT:    v_add3_u32 v21, v32, v9, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 8, v69
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v30, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v30, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-FAKE16-NEXT:    v_perm_b32 v55, v12, v10, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v21, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v21, 0x40c00000, v30
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v31, v11, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v30, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v21, 16, 1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 8, v55
+; GFX11-FAKE16-NEXT:    v_add3_u32 v25, v31, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v31, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v25, v30, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v25, 0x40c00000, v31
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v32, v21, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v21
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v32, v14, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v21, v30, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v32, v14, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v31, 0x400000, v14
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v33, v25, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v33, 16, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v14, v30, v31, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v30, 0x40c00000, v33
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-FAKE16-NEXT:    v_perm_b32 v54, v11, v9, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v50, v14, v21, 0x7060302
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v25, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v13, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v32, 0x400000, v13
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v33, v30, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-FAKE16-NEXT:    v_add3_u32 v31, v35, v13, 0x7fff
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v24
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 8, v50
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v31, v32, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v31, 0x40c00000, v34
+; GFX11-FAKE16-NEXT:    v_add3_u32 v32, v33, v30, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v33, 0x400000, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v34, v16, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v35, v31, 16, 1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v30, v15, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v36, 0x400000, v31
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v37, 0x400000, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v32, v32, v33, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v33, v34, v16, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v34, 0x400000, v16
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-FAKE16-NEXT:    v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add3_u32 v30, v30, v15, 0x7fff
+; GFX11-FAKE16-NEXT:    v_perm_b32 v49, v13, v25, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v25
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v33, v34, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v31, v35, v36, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v20
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v49
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v54
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v30, v37, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v37, v16, v32, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v36, v15, v31, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v31
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 24, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[17:18], 24, v[36:37]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[18:19], 24, v[49:50]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[19:20], 24, v[54:55]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[20:21], 24, v[69:70]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[21:22], 24, v[82:83]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[22:23], 24, v[85:86]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[23:24], 24, v[28:29]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[24:25], 24, v[26:27]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 8, v37
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 8, v36
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v50
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 8, v49
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 24, v55
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 8, v54
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 24, v70
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 24, v83
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 8, v83
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 24, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 8, v86
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 8, v85
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 24, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v28, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v26
+; GFX11-FAKE16-NEXT:  .LBB54_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, v26, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v81
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v96
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v28, 8, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v26, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v81, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v83
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v65, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v68, 8, v82
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v81, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v28, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v26, v65, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v52
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v23, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v20, 8, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v54
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v53
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v23, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v25, v26
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v28, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v50
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v49
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v48
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v18, 8, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v23, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v22, 8, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v24, 8, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v25, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v26, 8, v32
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v24
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v25, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v21
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-FAKE16-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -41241,409 +47913,795 @@ define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v64i8_to_v32bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8
-; GFX11-NEXT:    v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10
-; GFX11-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
-; GFX11-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v0, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_u16 v67, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_u16 v2, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_u16 v70, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_u16 v4, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_u16 v66, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_u16 v6, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_u16 v69, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_u16 v8, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_u16 v64, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_u16 v10, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_u16 v14, off, s32
-; GFX11-NEXT:    scratch_load_u16 v96, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v100, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v101, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u16 v102, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u16 v103, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u16 v113, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u16 v114, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u16 v115, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u16 v116, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_u16 v117, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_u16 v99, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_u16 v85, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_u16 v97, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_u16 v82, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u16 v86, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u16 v84, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u16 v87, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u16 v68, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u16 v80, off, s32 offset:20
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u16 v65, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v71, off, s32 offset:4
-; GFX11-NEXT:    v_lshlrev_b16 v50, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v51, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v49, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v39, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v54, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v48, 8, v11
-; GFX11-NEXT:    v_lshlrev_b16 v55, 8, v13
-; GFX11-NEXT:    v_lshlrev_b16 v52, 8, v15
-; GFX11-NEXT:    v_lshlrev_b16 v53, 8, v17
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v19
-; GFX11-NEXT:    v_lshlrev_b16 v83, 8, v21
-; GFX11-NEXT:    v_lshlrev_b16 v81, 8, v23
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v25
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v29
-; GFX11-NEXT:    s_waitcnt vmcnt(33)
-; GFX11-NEXT:    v_lshlrev_b16 v119, 8, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_lshlrev_b16 v118, 8, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_lshlrev_b16 v98, 8, v96
-; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v100
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_lshlrev_b16 v96, 8, v101
-; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v102
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_lshlrev_b16 v112, 8, v103
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_lshlrev_b16 v103, 8, v113
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_lshlrev_b16 v101, 8, v114
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_lshlrev_b16 v100, 8, v115
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_lshlrev_b16 v114, 8, v116
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_lshlrev_b16 v102, 8, v117
-; GFX11-NEXT:    v_lshlrev_b16 v117, 8, v10
-; GFX11-NEXT:    v_lshlrev_b16 v113, 8, v8
-; GFX11-NEXT:    v_lshlrev_b16 v116, 8, v6
-; GFX11-NEXT:    v_lshlrev_b16 v115, 8, v4
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_4
-; GFX11-NEXT:  .LBB55_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB55_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v36
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v38
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v31
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v35
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v50
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v51
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v34
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v33
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v16
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v18
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v39
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v54
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v48
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v55
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v32
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v22
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v49
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v53
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v52
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v17
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v83
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v81
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v8, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v10, v9, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v71
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v65
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v80
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v68
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v87
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v84
-; GFX11-NEXT:    v_or_b32_e32 v6, v6, v21
-; GFX11-NEXT:    v_or_b32_e32 v7, v7, v19
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v27
-; GFX11-NEXT:    v_or_b32_e32 v9, v9, v23
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v98
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v25
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v96
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v29
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v112
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v103
-; GFX11-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v11, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v13, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v15, v14, 0x5040100
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v86
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v82
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v97
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v85
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v99
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v64
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v69
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v66
-; GFX11-NEXT:    v_and_b32_e32 v19, 0xff, v70
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v67
-; GFX11-NEXT:    v_or_b32_e32 v11, v11, v101
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v100
-; GFX11-NEXT:    v_or_b32_e32 v13, v13, v114
-; GFX11-NEXT:    v_or_b32_e32 v14, v14, v102
-; GFX11-NEXT:    v_or_b32_e32 v15, v15, v117
-; GFX11-NEXT:    v_or_b32_e32 v16, v16, v113
-; GFX11-NEXT:    v_or_b32_e32 v17, v17, v116
-; GFX11-NEXT:    v_or_b32_e32 v18, v18, v115
-; GFX11-NEXT:    v_or_b32_e32 v19, v19, v118
-; GFX11-NEXT:    v_or_b32_e32 v20, v20, v119
-; GFX11-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v14, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v16, v15, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v18, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v20, v19, 0x5040100
-; GFX11-NEXT:    ; implicit-def: $vgpr36
-; GFX11-NEXT:    ; implicit-def: $vgpr37
-; GFX11-NEXT:    ; implicit-def: $vgpr33
-; GFX11-NEXT:    ; implicit-def: $vgpr38
-; GFX11-NEXT:    ; implicit-def: $vgpr31
-; GFX11-NEXT:    ; implicit-def: $vgpr35
-; GFX11-NEXT:    ; implicit-def: $vgpr34
-; GFX11-NEXT:    ; implicit-def: $vgpr32
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr18
-; GFX11-NEXT:    ; implicit-def: $vgpr20
-; GFX11-NEXT:    ; implicit-def: $vgpr22
-; GFX11-NEXT:    ; implicit-def: $vgpr24
-; GFX11-NEXT:    ; implicit-def: $vgpr26
-; GFX11-NEXT:    ; implicit-def: $vgpr28
-; GFX11-NEXT:    ; implicit-def: $vgpr30
-; GFX11-NEXT:    ; implicit-def: $vgpr71
-; GFX11-NEXT:    ; implicit-def: $vgpr65
-; GFX11-NEXT:    ; implicit-def: $vgpr80
-; GFX11-NEXT:    ; implicit-def: $vgpr68
-; GFX11-NEXT:    ; implicit-def: $vgpr87
-; GFX11-NEXT:    ; implicit-def: $vgpr84
-; GFX11-NEXT:    ; implicit-def: $vgpr86
-; GFX11-NEXT:    ; implicit-def: $vgpr82
-; GFX11-NEXT:    ; implicit-def: $vgpr97
-; GFX11-NEXT:    ; implicit-def: $vgpr85
-; GFX11-NEXT:    ; implicit-def: $vgpr99
-; GFX11-NEXT:    ; implicit-def: $vgpr64
-; GFX11-NEXT:    ; implicit-def: $vgpr69
-; GFX11-NEXT:    ; implicit-def: $vgpr66
-; GFX11-NEXT:    ; implicit-def: $vgpr70
-; GFX11-NEXT:    ; implicit-def: $vgpr67
-; GFX11-NEXT:    ; implicit-def: $vgpr50
-; GFX11-NEXT:    ; implicit-def: $vgpr51
-; GFX11-NEXT:    ; implicit-def: $vgpr49
-; GFX11-NEXT:    ; implicit-def: $vgpr39
-; GFX11-NEXT:    ; implicit-def: $vgpr54
-; GFX11-NEXT:    ; implicit-def: $vgpr48
-; GFX11-NEXT:    ; implicit-def: $vgpr55
-; GFX11-NEXT:    ; implicit-def: $vgpr52
-; GFX11-NEXT:    ; implicit-def: $vgpr53
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr83
-; GFX11-NEXT:    ; implicit-def: $vgpr81
-; GFX11-NEXT:    ; implicit-def: $vgpr21
-; GFX11-NEXT:    ; implicit-def: $vgpr19
-; GFX11-NEXT:    ; implicit-def: $vgpr27
-; GFX11-NEXT:    ; implicit-def: $vgpr23
-; GFX11-NEXT:    ; implicit-def: $vgpr98
-; GFX11-NEXT:    ; implicit-def: $vgpr25
-; GFX11-NEXT:    ; implicit-def: $vgpr96
-; GFX11-NEXT:    ; implicit-def: $vgpr29
-; GFX11-NEXT:    ; implicit-def: $vgpr112
-; GFX11-NEXT:    ; implicit-def: $vgpr103
-; GFX11-NEXT:    ; implicit-def: $vgpr101
-; GFX11-NEXT:    ; implicit-def: $vgpr100
-; GFX11-NEXT:    ; implicit-def: $vgpr114
-; GFX11-NEXT:    ; implicit-def: $vgpr102
-; GFX11-NEXT:    ; implicit-def: $vgpr117
-; GFX11-NEXT:    ; implicit-def: $vgpr113
-; GFX11-NEXT:    ; implicit-def: $vgpr116
-; GFX11-NEXT:    ; implicit-def: $vgpr115
-; GFX11-NEXT:    ; implicit-def: $vgpr118
-; GFX11-NEXT:    ; implicit-def: $vgpr119
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB55_2
-; GFX11-NEXT:  .LBB55_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v70, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v67, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v69, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v66, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, v64, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_add_nc_u16 v6, v97, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v118, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v119, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v99, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_nc_u16 v15, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v64, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v116, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; GFX11-NEXT:    v_or_b32_e32 v2, v115, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_add_nc_u16 v14, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v113, v1
-; GFX11-NEXT:    v_add_nc_u16 v66, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v114, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v117, v4
-; GFX11-NEXT:    v_add_nc_u16 v2, v85, 3
-; GFX11-NEXT:    v_add_nc_u16 v67, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_add_nc_u16 v0, v86, 3
-; GFX11-NEXT:    v_add_nc_u16 v12, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v82, 3
-; GFX11-NEXT:    v_add_nc_u16 v13, 0x300, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_add_nc_u16 v3, v87, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v84, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v102, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v101, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v100, v1
-; GFX11-NEXT:    v_add_nc_u16 v69, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v112, v3
-; GFX11-NEXT:    v_add_nc_u16 v11, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v103, v4
-; GFX11-NEXT:    v_add_nc_u16 v70, 0x300, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_add_nc_u16 v1, v80, 3
-; GFX11-NEXT:    v_add_nc_u16 v10, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v68, 3
-; GFX11-NEXT:    v_add_nc_u16 v68, 0x300, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v0, v71, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v3, v65, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, v28, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v96, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v29, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v0, v98, v0
-; GFX11-NEXT:    v_add_nc_u16 v9, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v25, v3
-; GFX11-NEXT:    v_add_nc_u16 v25, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v27, v4
-; GFX11-NEXT:    v_add_nc_u16 v8, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v30, 3
-; GFX11-NEXT:    v_add_nc_u16 v27, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v24, 3
-; GFX11-NEXT:    v_add_nc_u16 v7, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v26, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v3, v20, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_add_nc_u16 v4, v22, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v23, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v21, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v19, v2
-; GFX11-NEXT:    v_add_nc_u16 v19, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v83, v3
-; GFX11-NEXT:    v_add_nc_u16 v6, 0x300, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v81, v4
-; GFX11-NEXT:    v_add_nc_u16 v20, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v2, v16, 3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v0, v34, 3
-; GFX11-NEXT:    v_add_nc_u16 v16, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v31, 3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, v18, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_add_nc_u16 v4, v32, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v53, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v55, v0
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v54, v1
-; GFX11-NEXT:    v_add_nc_u16 v18, 0x300, v2
-; GFX11-NEXT:    v_or_b32_e32 v2, v17, v3
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v52, v4
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v1, v33, 3
-; GFX11-NEXT:    v_add_nc_u16 v17, v36, 3
-; GFX11-NEXT:    v_add_nc_u16 v21, v37, 3
-; GFX11-NEXT:    v_add_nc_u16 v22, v38, 3
-; GFX11-NEXT:    v_add_nc_u16 v23, v35, 3
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; GFX11-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; GFX11-NEXT:    v_or_b32_e32 v1, v49, v1
-; GFX11-NEXT:    v_or_b32_e32 v17, v50, v17
-; GFX11-NEXT:    v_or_b32_e32 v21, v51, v21
-; GFX11-NEXT:    v_or_b32_e32 v22, v39, v22
-; GFX11-NEXT:    v_or_b32_e32 v23, v48, v23
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v17, 0x300, v17
-; GFX11-NEXT:    v_add_nc_u16 v21, 0x300, v21
-; GFX11-NEXT:    v_add_nc_u16 v22, 0x300, v22
-; GFX11-NEXT:    v_add_nc_u16 v23, 0x300, v23
-; GFX11-NEXT:    v_add_nc_u16 v24, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v26, 0x300, v2
-; GFX11-NEXT:    v_perm_b32 v0, v21, v17, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v23, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v24, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v26, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v16, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v20, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v27, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v25, v9, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v68, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v70, v11, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v69, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v66, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v64, v15, 0x5040100
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v64i8_to_v32bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1f
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v55, off, s32 offset:128
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:124
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v55, off, s32 offset:120
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:116
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v53, off, s32 offset:112
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v31, off, s32 offset:108
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v53, off, s32 offset:104
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:100
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v52, off, s32 offset:96
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v31, off, s32 offset:92
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v54, off, s32 offset:88
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:132
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v39, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v48, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v48, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v50, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v51, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v52, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v54, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v64, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v64, off, s32 offset:72
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v65, off, s32 offset:80
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v38, off, s32 offset:84
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v36, off, s32 offset:76
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v38, off, s32 offset:68
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v35, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v37, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v36, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v37, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v34, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.h, v29.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v27.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.h, v25.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v39.l, v23.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.h, v21.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.h, v19.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.h, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.h, v22.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.h, v20.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.h, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.h, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v23.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v20.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v22.l, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v27.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.l, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v25.h, 8, v25.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v26.h, 8, v26.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v30.h, 8, v30.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.l, 8, v39.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v28.h, 8, v28.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.l, 8, v29.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.h, 8, v55.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v55.l, 8, v55.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v53.l, 8, v53.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.h, 8, v52.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v29.h, 8, v39.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v49.h, 8, v48.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v39.h, 8, v48.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.l, 8, v50.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v48.h, 8, v50.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.h, 8, v51.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v52.l, 8, v52.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.l, 8, v54.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v50.h, 8, v64.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v54.l, 8, v64.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v51.l, 8, v65.l
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v66
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-TRUE16-NEXT:  .LBB55_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v21.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v35.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.h, 0xff, v34.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.h, 0xff, v36.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v37.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v38.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v33.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v23.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v23.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v1.h, v20.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v27.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v22.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.l, v27.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v25.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v26.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v5.l, v30.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v5.h, v39.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v6.l, v28.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v6.h, v29.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v49.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.h, v7.h, v29.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v49.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v39.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v9.l, v48.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v48.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v10.l, v51.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v10.h, v52.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v11.l, v50.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.h, v11.h, v50.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v54.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.h, v12.h, v51.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v13.l, v54.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.h, v13.h, v52.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.l, v14.l, v53.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v14.h, v14.h, v53.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.l, v15.l, v55.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v15.h, v15.h, v55.h
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr18_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr24_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr37_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr35_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr36_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr38_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr31_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr33_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr32_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr25_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr28_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr49_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr53_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-TRUE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v33.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v32.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v32.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v31.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v31.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, v38.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v55.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v55.h, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v38.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v53.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v53.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v15.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v3.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v14.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v52.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v54.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v37.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v36.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v37.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v54.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v50.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v13.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v36.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v51.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v50.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v35.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v12.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v11.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v34.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v34.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v33.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v49.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v48.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v48.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v39.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v10.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v28.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v9.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v8.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v30.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v24.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v26.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v24.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v49.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v28.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v29.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v30.h, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v22.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v7.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v6.h, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v19.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v18.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v21.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v17.h, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v39.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v25.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v26.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v27.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v25.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v5.h, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v21.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v16.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v19.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v16.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v17.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v27.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v22.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v20.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v20.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.h, v23.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v23.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, 0x300, v16.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v16.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, 0x300, v17.l
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v64i8_to_v32bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1f
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v0, off, s32 offset:128
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v67, off, s32 offset:124
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s32 offset:120
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v70, off, s32 offset:116
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v4, off, s32 offset:112
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v66, off, s32 offset:108
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v6, off, s32 offset:104
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v69, off, s32 offset:100
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v8, off, s32 offset:96
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v64, off, s32 offset:92
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v10, off, s32 offset:88
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v14, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v96, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v100, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v101, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v102, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v103, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v113, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v114, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v115, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v116, off, s32 offset:72
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v117, off, s32 offset:80
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v99, off, s32 offset:84
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v85, off, s32 offset:76
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v97, off, s32 offset:68
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v82, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v86, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v84, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v87, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v68, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v80, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v65, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v71, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v50, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v51, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v49, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v39, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v54, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v48, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v55, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v52, 8, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v53, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v83, 8, v21
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v81, 8, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v25
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v29
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(33)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v119, 8, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v118, 8, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v14
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v98, 8, v96
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v100
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v96, 8, v101
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v102
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v112, 8, v103
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v103, 8, v113
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v101, 8, v114
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v100, 8, v115
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v114, 8, v116
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v102, 8, v117
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v117, 8, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v113, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v116, 8, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v115, 8, v4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-FAKE16-NEXT:  .LBB55_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v36
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v37
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v38
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v35
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v50
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v51
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v33
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v39
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v54
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v48
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v55
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v32
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v49
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v53
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v52
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v83
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v81
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v8, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v10, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v71
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v65
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v80
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v68
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v87
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v84
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v19
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v98
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v96
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v112
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v103
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v9, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v11, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v13, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v15, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v86
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v82
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v97
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v85
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v99
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v64
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v69
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v66
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 0xff, v70
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v67
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v101
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v100
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v13, v114
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v14, v102
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v117
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v16, v113
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v17, v116
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, v18, v115
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v19, v19, v118
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v20, v20, v119
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v14, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v16, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v18, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v20, v19, 0x5040100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr36
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr37
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr33
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr38
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr31
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr35
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr34
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr32
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr18
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr20
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr22
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr24
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr26
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr28
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr30
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr71
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr65
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr80
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr68
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr87
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr84
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr86
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr82
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr97
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr85
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr99
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr64
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr69
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr66
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr70
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr67
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr50
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr51
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr49
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr39
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr54
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr48
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr55
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr52
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr53
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr83
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr81
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr21
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr19
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr27
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr23
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr98
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr25
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr96
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr29
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr112
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr103
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr101
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr100
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr114
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr102
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr117
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr113
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr116
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr115
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr118
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr119
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-FAKE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v70, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v67, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v69, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v66, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, v64, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v97, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v118, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v119, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v99, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v15, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v64, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v116, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v115, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v14, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v113, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v66, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v114, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v117, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v85, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v67, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v86, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v12, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v82, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v13, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v87, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v84, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v102, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v101, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v100, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v69, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v112, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v11, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v103, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v70, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v80, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v10, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v68, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v68, 0x300, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v71, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v65, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v28, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v96, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v29, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v98, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v9, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v25, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v25, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v27, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v30, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v27, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v24, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v7, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v26, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v20, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v22, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v23, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v21, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v19, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v19, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v83, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v81, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v20, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v16, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v34, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v16, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v31, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v18, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v32, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v53, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v55, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v54, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v18, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v17, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v52, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v33, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, v36, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, v37, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, v38, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, v35, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v21, 0xff, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v23, 0xff, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v49, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v50, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, v51, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v22, v39, v22
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v23, v48, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v17, 0x300, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v21, 0x300, v21
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v22, 0x300, v22
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v23, 0x300, v23
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v24, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v26, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v21, v17, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v22, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v23, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v24, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v26, v18, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v16, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v6, v20, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v19, v7, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v8, v27, v8, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v9, v25, v9, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v68, v10, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v70, v11, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v12, v69, v12, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v13, v67, v13, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v14, v66, v14, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v15, v64, v15, 0x5040100
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index 26ce1771e220d..42b2f9a168cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define double @bitcast_i64_to_f64(i64 %a, i32 %b) {
 ; GCN-LABEL: bitcast_i64_to_f64:
@@ -1104,52 +1105,106 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4bf16_to_i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB11_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT:  .LBB11_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT:  .LBB11_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB11_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1285,47 +1340,83 @@ define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_i64_to_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  ; %bb.2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB12_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:  .LBB12_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v9
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_i64_to_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_i64_to_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB12_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:  .LBB12_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v8
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1524,90 +1615,176 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i8_to_i64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v9, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB13_4
-; GFX11-NEXT:  .LBB13_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB13_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB13_2
-; GFX11-NEXT:  .LBB13_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v9, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v2, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i8_to_i64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-TRUE16-NEXT:  .LBB13_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-TRUE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v5.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i8_to_i64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB13_4
+; GFX11-FAKE16-NEXT:  .LBB13_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB13_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-FAKE16-NEXT:  .LBB13_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v2, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2548,52 +2725,106 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4bf16_to_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB23_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT:  .LBB23_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT:  .LBB23_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB23_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2727,45 +2958,80 @@ define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_f64_to_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  ; %bb.2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  .LBB24_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v9
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_f64_to_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  .LBB24_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_f64_to_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  .LBB24_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v8
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2964,90 +3230,176 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i8_to_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v9, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB25_4
-; GFX11-NEXT:  .LBB25_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB25_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB25_2
-; GFX11-NEXT:  .LBB25_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v9, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v2, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i8_to_f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-TRUE16-NEXT:  .LBB25_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-TRUE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v5.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i8_to_f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB25_4
+; GFX11-FAKE16-NEXT:  .LBB25_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB25_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB25_2
+; GFX11-FAKE16-NEXT:  .LBB25_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v2, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3861,52 +4213,106 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4bf16_to_v2i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB33_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT:  .LBB33_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT:  .LBB33_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB33_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB33_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4042,46 +4448,81 @@ define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2i32_to_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  ; %bb.2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB34_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 3, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  .LBB34_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v9
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2i32_to_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2i32_to_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB34_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 3, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  .LBB34_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v8
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4280,90 +4721,176 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i8_to_v2i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v9, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB35_4
-; GFX11-NEXT:  .LBB35_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB35_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB35_2
-; GFX11-NEXT:  .LBB35_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v9, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v2, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i8_to_v2i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-TRUE16-NEXT:  .LBB35_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-TRUE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v5.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i8_to_v2i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB35_4
+; GFX11-FAKE16-NEXT:  .LBB35_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB35_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB35_2
+; GFX11-FAKE16-NEXT:  .LBB35_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v2, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5025,52 +5552,106 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4bf16_to_v2f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB41_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v3
-; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
-; GFX11-NEXT:  .LBB41_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v2f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v3
+; GFX11-TRUE16-NEXT:  .LBB41_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v2f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB41_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB41_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5206,45 +5787,79 @@ define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v2f32_to_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  ; %bb.2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB42_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  .LBB42_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v9
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v2f32_to_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v2f32_to_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB42_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  .LBB42_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v8
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5443,90 +6058,176 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i8_to_v2f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v9, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB43_4
-; GFX11-NEXT:  .LBB43_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB43_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB43_2
-; GFX11-NEXT:  .LBB43_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v9, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v2, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i8_to_v2f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-TRUE16-NEXT:  .LBB43_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-TRUE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v5.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i8_to_v2f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB43_4
+; GFX11-FAKE16-NEXT:  .LBB43_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB43_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB43_2
+; GFX11-FAKE16-NEXT:  .LBB43_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v2, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -6006,53 +6707,109 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4bf16_to_v4i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB47_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
-; GFX11-NEXT:  .LBB47_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-TRUE16-NEXT:  .LBB47_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB47_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB47_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -6216,46 +6973,81 @@ define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4i16_to_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  ; %bb.2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB48_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  .LBB48_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v9
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4i16_to_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4i16_to_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB48_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  .LBB48_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v8
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -6462,90 +7254,176 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i8_to_v4i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v9, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB49_4
-; GFX11-NEXT:  .LBB49_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB49_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB49_2
-; GFX11-NEXT:  .LBB49_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v9, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v2, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i8_to_v4i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-TRUE16-NEXT:  .LBB49_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-TRUE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v5.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i8_to_v4i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB49_4
+; GFX11-FAKE16-NEXT:  .LBB49_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB49_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB49_2
+; GFX11-FAKE16-NEXT:  .LBB49_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v2, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -6833,53 +7711,107 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4bf16_to_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB51_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
-; GFX11-NEXT:  .LBB51_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v4f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v1, v6, v7 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v3, v0
+; GFX11-TRUE16-NEXT:  .LBB51_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v4f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB51_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB51_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -7044,46 +7976,81 @@ define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4f16_to_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  ; %bb.2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB52_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  .LBB52_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v9
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4f16_to_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:  ; %bb.4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v9.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4f16_to_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB52_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  .LBB52_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v8
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -7282,90 +8249,176 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i8_to_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v9, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB53_4
-; GFX11-NEXT:  .LBB53_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB53_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB53_2
-; GFX11-NEXT:  .LBB53_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v9, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v2, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i8_to_v4f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-TRUE16-NEXT:  .LBB53_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-TRUE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v5.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i8_to_v4f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB53_4
+; GFX11-FAKE16-NEXT:  .LBB53_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB53_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB53_2
+; GFX11-FAKE16-NEXT:  .LBB53_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v2, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -7592,79 +8645,157 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v9
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v4bf16_to_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX11-NEXT:  ; %bb.2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB54_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v8
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v1
-; GFX11-NEXT:    v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
-; GFX11-NEXT:    v_bfe_u32 v9, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v6, v7 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v8, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v8, v8, v2, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v8, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_perm_b32 v10, v1, v0, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
-; GFX11-NEXT:    v_perm_b32 v11, v2, v3, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
-; GFX11-NEXT:  .LBB54_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v8
-; GFX11-NEXT:    v_mov_b32_e32 v4, v9
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v4bf16_to_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v2
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[10:11], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v9.h
+; GFX11-TRUE16-NEXT:  .LBB54_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v8.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v0 :: v_dual_lshlrev_b32 v0, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v5, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v12, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v7, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v12, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0xffff, v3, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v9, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[10:11], 24, v[8:9]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:  .LBB54_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v4bf16_to_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB54_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v8
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v6, v7 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v8, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v10, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v2, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[10:11]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; GFX11-FAKE16-NEXT:  .LBB54_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v8
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v9
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -7867,90 +8998,176 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v8i8_to_v4bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v9, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB55_4
-; GFX11-NEXT:  .LBB55_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB55_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v7
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB55_2
-; GFX11-NEXT:  .LBB55_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v9, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v2, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v0, v8, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v7, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v8i8_to_v4bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v8
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-TRUE16-NEXT:  .LBB55_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v0.h, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v1.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-TRUE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v5.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v8i8_to_v4bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB55_4
+; GFX11-FAKE16-NEXT:  .LBB55_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB55_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB55_2
+; GFX11-FAKE16-NEXT:  .LBB55_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v9, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v2, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index b87e7b0916032..852114f2ba12f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v3i32_to_v3f32:
@@ -317,57 +318,105 @@ define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v3i32_to_v12i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_mov_b32_e32 v14, v1
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; GFX11-NEXT:  .LBB2_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u32_e32 v8, 3, v8
-; GFX11-NEXT:    v_add_nc_u32_e32 v14, 3, v14
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 3, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; GFX11-NEXT:  .LBB2_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v13
-; GFX11-NEXT:    v_mov_b32_e32 v4, v14
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3i32_to_v12i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v11
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, 3, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 3, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 3, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v11
+; GFX11-TRUE16-NEXT:  .LBB2_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v13.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3i32_to_v12i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
+; GFX11-FAKE16-NEXT:  .LBB2_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 3, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 3, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
+; GFX11-FAKE16-NEXT:  .LBB2_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v13
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v14
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -627,102 +676,203 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v12i8_to_v3i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB3_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB3_4
-; GFX11-NEXT:  .LBB3_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB3_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v12
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v3
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v4
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB3_2
-; GFX11-NEXT:  .LBB3_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v13, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v12, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v15, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v8
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v5
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12i8_to_v3i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v12
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB3_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX11-TRUE16-NEXT:  .LBB3_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB3_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB3_2
+; GFX11-TRUE16-NEXT:  .LBB3_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v7.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v7.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v5.h, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v3.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12i8_to_v3i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB3_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX11-FAKE16-NEXT:  .LBB3_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB3_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB3_2
+; GFX11-FAKE16-NEXT:  .LBB3_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v13, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v12, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v15, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v5
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1036,70 +1186,140 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v6bf16_to_v3i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB5_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v6, v10, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v9, v9, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v5, v9, v10
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v11, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT:  .LBB5_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB5_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v6, v8 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v3, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT:  .LBB5_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB5_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB5_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -1741,56 +1961,103 @@ define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v3f32_to_v12i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_mov_b32_e32 v14, v1
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB10_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; GFX11-NEXT:  .LBB10_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB10_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; GFX11-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; GFX11-NEXT:  .LBB10_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v13
-; GFX11-NEXT:    v_mov_b32_e32 v4, v14
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v3f32_to_v12i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v11
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB10_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v11
+; GFX11-TRUE16-NEXT:  .LBB10_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v13.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v3f32_to_v12i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB10_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
+; GFX11-FAKE16-NEXT:  .LBB10_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB10_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
+; GFX11-FAKE16-NEXT:  .LBB10_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v13
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v14
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2050,102 +2317,203 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v12i8_to_v3f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB11_4
-; GFX11-NEXT:  .LBB11_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB11_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v16
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v12
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v7
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v3
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v4
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB11_2
-; GFX11-NEXT:  .LBB11_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v13, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v8, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_or_b32_e32 v0, v16, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v12, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v15, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v7, v4
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v8
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v5
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12i8_to_v3f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v12
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB11_4
+; GFX11-TRUE16-NEXT:  .LBB11_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB11_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v0.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v3, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-TRUE16-NEXT:  .LBB11_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v7.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v7.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v5.h, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v3.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12i8_to_v3f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB11_4
+; GFX11-FAKE16-NEXT:  .LBB11_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB11_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB11_2
+; GFX11-FAKE16-NEXT:  .LBB11_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v13, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v8, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v16, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v12, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v15, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v8
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v5
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -2458,70 +2826,140 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v6bf16_to_v3f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB13_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v7, 0x400000, v4
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT:    v_add3_u32 v6, v10, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_f32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v9, v9, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v5, v9, v10
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v6, v11, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
-; GFX11-NEXT:  .LBB13_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v3f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v3, v6, v8 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v3, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v8, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v5
+; GFX11-TRUE16-NEXT:  .LBB13_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v3f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB13_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v8, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v5, v9, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v6, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB13_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3256,102 +3694,206 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v12i8_to_v6bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB18_4
-; GFX11-NEXT:  .LBB18_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB18_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v17
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v12
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB18_2
-; GFX11-NEXT:  .LBB18_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v13, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v0, v17, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v15, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v16, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v12, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12i8_to_v6bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v12
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB18_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX11-TRUE16-NEXT:  .LBB18_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB18_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v1.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB18_2
+; GFX11-TRUE16-NEXT:  .LBB18_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v8.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v7.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v5.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v4.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12i8_to_v6bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB18_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB18_4
+; GFX11-FAKE16-NEXT:  .LBB18_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB18_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB18_2
+; GFX11-FAKE16-NEXT:  .LBB18_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v13, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v17, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v15, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v16, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -3656,108 +4198,217 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v6bf16_to_v12i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_mov_b32_e32 v14, v1
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB19_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; GFX11-NEXT:  .LBB19_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB19_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v14
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v13
-; GFX11-NEXT:    v_dual_mov_b32 v12, 0x7fc07fc0 :: v_dual_lshlrev_b32 v3, 16, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT:    v_or_b32_e32 v11, 0x400000, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
-; GFX11-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v14
-; GFX11-NEXT:    v_bfe_u32 v10, v2, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; GFX11-NEXT:    v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v0, 0x40c00000, v4
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_add3_u32 v1, v10, v2, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v4, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v4, v8, v3, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_bfe_u32 v1, v5, 16, 1
-; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v1, v1, v5, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v1, v11, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v6, v7, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v11, v4, v5, 0x7060302
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[11:12]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:  .LBB19_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v13
-; GFX11-NEXT:    v_mov_b32_e32 v4, v14
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v12i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v13, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr16_lo16
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB19_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[16:17], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[12:13]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v10.l
+; GFX11-TRUE16-NEXT:  .LBB19_2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB19_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v12
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v13
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v13
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v0, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v0, v0, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v7, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v11, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v11, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v12
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v12, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v8, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v12, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v13, 0xffff, v1, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v12, 0xffff, v5, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v11, 0x7fc07fc0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v8.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[12:13]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v10, 0xffff, v7, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[16:17], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:  .LBB19_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v14.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v8.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v16.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v12i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB19_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
+; GFX11-FAKE16-NEXT:  .LBB19_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB19_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v13
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, 0x7fc07fc0 :: v_dual_lshlrev_b32 v3, 16, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x400000, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v8
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v14
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v1, v10, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v8, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v1, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_add3_u32 v1, v1, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v3, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v1, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v6, v7, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v11, v4, v5, 0x7060302
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[0:1]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[11:12]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:  .LBB19_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v13
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v14
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4012,102 +4663,206 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v12i8_to_v6f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB20_4
-; GFX11-NEXT:  .LBB20_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB20_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v17
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v12
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB20_2
-; GFX11-NEXT:  .LBB20_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v13, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v0, v17, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v15, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v16, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v12, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12i8_to_v6f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v12
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX11-TRUE16-NEXT:  .LBB20_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB20_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v1.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB20_2
+; GFX11-TRUE16-NEXT:  .LBB20_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v8.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v7.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v5.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v4.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12i8_to_v6f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB20_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB20_4
+; GFX11-FAKE16-NEXT:  .LBB20_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB20_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB20_2
+; GFX11-FAKE16-NEXT:  .LBB20_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v13, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v17, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v15, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v16, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4319,60 +5074,109 @@ define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v8, v13
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v6f16_to_v12i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v16, v1
-; GFX11-NEXT:    v_mov_b32_e32 v15, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB21_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[15:16]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v15
-; GFX11-NEXT:  .LBB21_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB21_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_mov_b32_e32 v14, 0x7e007e00
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v16
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[15:16]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v15
-; GFX11-NEXT:  .LBB21_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v15
-; GFX11-NEXT:    v_mov_b32_e32 v4, v16
-; GFX11-NEXT:    v_mov_b32_e32 v8, v13
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6f16_to_v12i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v13, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[12:13]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB21_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v11, 0x7e007e00
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[12:13]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[14:15], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
+; GFX11-TRUE16-NEXT:  .LBB21_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v13.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v13.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v14.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6f16_to_v12i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v16, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB21_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v15
+; GFX11-FAKE16-NEXT:  .LBB21_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB21_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, 0x7e007e00
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[15:16]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v15
+; GFX11-FAKE16-NEXT:  .LBB21_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v15
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v13
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4644,102 +5448,206 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v12i8_to_v6i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v1
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v3
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v5
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v7
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v11
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB22_3
-; GFX11-NEXT:  ; %bb.1: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB22_4
-; GFX11-NEXT:  .LBB22_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB22_3: ; %cmp.false
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v13
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v10
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v17
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v15
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v16
-; GFX11-NEXT:    v_or_b32_e32 v3, v3, v12
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    ; implicit-def: $vgpr13
-; GFX11-NEXT:    ; implicit-def: $vgpr14
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr8
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr17
-; GFX11-NEXT:    ; implicit-def: $vgpr15
-; GFX11-NEXT:    ; implicit-def: $vgpr16
-; GFX11-NEXT:    ; implicit-def: $vgpr12
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB22_2
-; GFX11-NEXT:  .LBB22_4: ; %cmp.true
-; GFX11-NEXT:    v_add_nc_u16 v0, v13, 3
-; GFX11-NEXT:    v_add_nc_u16 v1, v14, 3
-; GFX11-NEXT:    v_add_nc_u16 v2, v4, 3
-; GFX11-NEXT:    v_add_nc_u16 v3, v6, 3
-; GFX11-NEXT:    v_add_nc_u16 v4, v8, 3
-; GFX11-NEXT:    v_add_nc_u16 v6, v10, 3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_or_b32_e32 v0, v17, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v15, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, v16, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, v12, v3
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v4
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_add_nc_u16 v0, 0x300, v0
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x300, v1
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x300, v2
-; GFX11-NEXT:    v_add_nc_u16 v3, 0x300, v3
-; GFX11-NEXT:    v_add_nc_u16 v4, 0x300, v4
-; GFX11-NEXT:    v_add_nc_u16 v5, 0x300, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v5
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v12i8_to_v6i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v9.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 8, v9.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v12
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX11-TRUE16-NEXT:  .LBB22_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB22_3: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v0.h, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v1.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v6
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v4, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr8_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr6_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB22_2
+; GFX11-TRUE16-NEXT:  .LBB22_4: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v9.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v8.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, v7.h, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.h, v7.l, 3
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v5.l, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v4.h, v2.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v3.l, 0x300, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x300, v1.h
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v4.l, 0x300, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v5, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v12i8_to_v6i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v11
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB22_3
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB22_4
+; GFX11-FAKE16-NEXT:  .LBB22_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB22_3: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr13
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr14
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr8
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr17
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr15
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr16
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr12
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB22_2
+; GFX11-FAKE16-NEXT:  .LBB22_4: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v13, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, v14, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v4, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v6, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, v8, 3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v6, v10, 3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v17, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v15, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v16, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v5, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, 0x300, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x300, v1
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x300, v2
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, 0x300, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v4, 0x300, v4
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v5, 0x300, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v5
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -4954,57 +5862,105 @@ define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, v14
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v6i16_to_v12i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
-; GFX11-NEXT:    v_mov_b32_e32 v14, v1
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    ; implicit-def: $vgpr1
-; GFX11-NEXT:    ; implicit-def: $vgpr2
-; GFX11-NEXT:    ; implicit-def: $vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr5
-; GFX11-NEXT:    ; implicit-def: $vgpr6
-; GFX11-NEXT:    ; implicit-def: $vgpr7
-; GFX11-NEXT:    ; implicit-def: $vgpr9
-; GFX11-NEXT:    ; implicit-def: $vgpr10
-; GFX11-NEXT:    ; implicit-def: $vgpr11
-; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB23_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.false
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; GFX11-NEXT:  .LBB23_2: ; %Flow
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB23_4
-; GFX11-NEXT:  ; %bb.3: ; %cmp.true
-; GFX11-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; GFX11-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
-; GFX11-NEXT:  .LBB23_4: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    v_mov_b32_e32 v0, v13
-; GFX11-NEXT:    v_mov_b32_e32 v4, v14
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6i16_to_v12i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr1_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr9_lo16
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v11
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %Flow
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB23_4
+; GFX11-TRUE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[11:12]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[13:14], 24, v[10:11]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v11
+; GFX11-TRUE16-NEXT:  .LBB23_4: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v11.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v12.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v12.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v10.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v10.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v13.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6i16_to_v12i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr1
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr9
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr10
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr11
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB23_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.false
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
+; GFX11-FAKE16-NEXT:  .LBB23_2: ; %Flow
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB23_4
+; GFX11-FAKE16-NEXT:  ; %bb.3: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[13:14]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[11:12], 24, v[8:9]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v13
+; GFX11-FAKE16-NEXT:  .LBB23_4: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v13
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v14
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5225,72 +6181,140 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v6bf16_to_v6f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB24_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT:    v_add3_u32 v8, v8, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v9, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v9, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v13, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v10, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
-; GFX11-NEXT:  .LBB24_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v6f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v3
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, v8, v12 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v11, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v4
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v6, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v5
+; GFX11-TRUE16-NEXT:  .LBB24_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v6f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB24_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v9, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB24_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 
@@ -5641,72 +6665,146 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) {
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bitcast_v6bf16_to_v6i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v3
-; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB26_2
-; GFX11-NEXT:  ; %bb.1: ; %cmp.true
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-NEXT:    v_bfe_u32 v11, v5, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
-; GFX11-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_u32 v8, v4, 16, 1
-; GFX11-NEXT:    v_add3_u32 v8, v8, v4, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
-; GFX11-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
-; GFX11-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v6, v9, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v6, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    v_or_b32_e32 v13, 0x400000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v8, v9, v1, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v13, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v7, v10, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
-; GFX11-NEXT:  .LBB26_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bitcast_v6bf16_to_v6i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-TRUE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-TRUE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB26_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v5, 16, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v7, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x400000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v6, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v8, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v7, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v2, v2, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-TRUE16-NEXT:  .LBB26_2: ; %end
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bitcast_v6bf16_to_v6i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v3
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB26_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v4, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v8, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v6, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, 0x400000, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v8, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add3_u32 v8, v9, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-FAKE16-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v6, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v7, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
+; GFX11-FAKE16-NEXT:  .LBB26_2: ; %end
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %b, 0
   br i1 %cmp, label %cmp.true, label %cmp.false
 

diff  --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index c019c83da5ef8..f979d01e495ba 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -4,19 +4,27 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_ITERATIVE,GFX1132_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_ITERATIVE,GFX1264_ITERATIVE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_ITERATIVE,GFX1232_ITERATIVE-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_DPP,GFX1164_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-FAKE16,GFX1164_DPP,GFX1164_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-TRUE16,GFX1132_DPP,GFX1132_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132-FAKE16,GFX1132_DPP,GFX1132_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-TRUE16,GFX1264_DPP,GFX1264_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1264,GFX1264-FAKE16,GFX1264_DPP,GFX1264_DPP-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-TRUE16,GFX1232_DPP,GFX1232_DPP-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1232,GFX1232-FAKE16,GFX1232_DPP,GFX1232_DPP-FAKE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -7289,163 +7297,325 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX1032-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: uniform_or_i8:
-; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB12_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_and_b32 s7, s2, 3
-; GFX1164-NEXT:    s_and_b32 s8, s6, 0xff
-; GFX1164-NEXT:    s_lshl_b32 s7, s7, 3
-; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT:    s_lshl_b32 s9, s8, s7
-; GFX1164-NEXT:    s_and_b32 s8, s2, -4
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s9
-; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    s_mov_b32 s9, s3
-; GFX1164-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
-; GFX1164-NEXT:  .LBB12_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_endpgm
-;
-; GFX1132-LABEL: uniform_or_i8:
-; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-NEXT:    s_load_b32 s4, s[4:5], 0x34
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB12_2
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_and_b32 s6, s2, 3
-; GFX1132-NEXT:    s_and_b32 s7, s4, 0xff
-; GFX1132-NEXT:    s_lshl_b32 s6, s6, 3
-; GFX1132-NEXT:    s_and_b32 s8, s2, -4
-; GFX1132-NEXT:    s_lshl_b32 s7, s7, s6
-; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    s_mov_b32 s9, s3
-; GFX1132-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
-; GFX1132-NEXT:  .LBB12_2:
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_endpgm
-;
-; GFX1264-LABEL: uniform_or_i8:
-; GFX1264:       ; %bb.0:
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1264-NEXT:    ; implicit-def: $vgpr0
-; GFX1264-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1264-NEXT:    s_cbranch_execz .LBB12_2
-; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_and_b32 s7, s2, 3
-; GFX1264-NEXT:    s_and_b32 s8, s6, 0xff
-; GFX1264-NEXT:    s_lshl_b32 s7, s7, 3
-; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT:    s_lshl_b32 s9, s8, s7
-; GFX1264-NEXT:    s_and_b32 s8, s2, -4
-; GFX1264-NEXT:    v_mov_b32_e32 v0, s9
-; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    s_mov_b32 s9, s3
-; GFX1264-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
-; GFX1264-NEXT:  .LBB12_2:
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    buffer_store_b8 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_endpgm
-;
-; GFX1232-LABEL: uniform_or_i8:
-; GFX1232:       ; %bb.0:
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_load_b32 s4, s[4:5], 0x34
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1232-NEXT:    ; implicit-def: $vgpr0
-; GFX1232-NEXT:    s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT:    s_cbranch_execz .LBB12_2
-; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_and_b32 s6, s2, 3
-; GFX1232-NEXT:    s_and_b32 s7, s4, 0xff
-; GFX1232-NEXT:    s_lshl_b32 s6, s6, 3
-; GFX1232-NEXT:    s_and_b32 s8, s2, -4
-; GFX1232-NEXT:    s_lshl_b32 s7, s7, s6
-; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    s_mov_b32 s9, s3
-; GFX1232-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
-; GFX1232-NEXT:  .LBB12_2:
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    buffer_store_b8 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_endpgm
+; GFX1164-TRUE16-LABEL: uniform_or_i8:
+; GFX1164-TRUE16:       ; %bb.0:
+; GFX1164-TRUE16-NEXT:    s_clause 0x1
+; GFX1164-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1164-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1164-TRUE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1164-TRUE16-NEXT:  ; %bb.1:
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s8, s6
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1164-TRUE16-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1164-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1164-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1164-TRUE16-NEXT:  .LBB12_2:
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-TRUE16-NEXT:    v_cndmask_b16 v0.l, s6, 0, vcc
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    s_endpgm
+;
+; GFX1164-FAKE16-LABEL: uniform_or_i8:
+; GFX1164-FAKE16:       ; %bb.0:
+; GFX1164-FAKE16-NEXT:    s_clause 0x1
+; GFX1164-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1164-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1164-FAKE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1164-FAKE16-NEXT:  ; %bb.1:
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1164-FAKE16-NEXT:    s_and_b32 s8, s6, 0xff
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1164-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1164-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1164-FAKE16-NEXT:  .LBB12_2:
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1164-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132-TRUE16-LABEL: uniform_or_i8:
+; GFX1132-TRUE16:       ; %bb.0:
+; GFX1132-TRUE16-NEXT:    s_clause 0x1
+; GFX1132-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1132-TRUE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1132-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1132-TRUE16-NEXT:  ; %bb.1:
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, s4
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1132-TRUE16-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX1132-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1132-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1132-TRUE16-NEXT:  .LBB12_2:
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, 0, vcc_lo
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132-FAKE16-LABEL: uniform_or_i8:
+; GFX1132-FAKE16:       ; %bb.0:
+; GFX1132-FAKE16-NEXT:    s_clause 0x1
+; GFX1132-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-FAKE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1132-FAKE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1132-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1132-FAKE16-NEXT:  ; %bb.1:
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s7, s4, 0xff
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1132-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1132-FAKE16-NEXT:  .LBB12_2:
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1132-FAKE16-NEXT:    s_endpgm
+;
+; GFX1264-TRUE16-LABEL: uniform_or_i8:
+; GFX1264-TRUE16:       ; %bb.0:
+; GFX1264-TRUE16-NEXT:    s_clause 0x1
+; GFX1264-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1264-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1264-TRUE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1264-TRUE16-NEXT:  ; %bb.1:
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s8, s6
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1264-TRUE16-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1264-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1264-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1264-TRUE16-NEXT:  .LBB12_2:
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-TRUE16-NEXT:    v_cndmask_b16 v0.l, s6, 0, vcc
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    s_endpgm
+;
+; GFX1264-FAKE16-LABEL: uniform_or_i8:
+; GFX1264-FAKE16:       ; %bb.0:
+; GFX1264-FAKE16-NEXT:    s_clause 0x1
+; GFX1264-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1264-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1264-FAKE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1264-FAKE16-NEXT:  ; %bb.1:
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1264-FAKE16-NEXT:    s_and_b32 s8, s6, 0xff
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1264-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1264-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1264-FAKE16-NEXT:  .LBB12_2:
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1264-FAKE16-NEXT:    s_endpgm
+;
+; GFX1232-TRUE16-LABEL: uniform_or_i8:
+; GFX1232-TRUE16:       ; %bb.0:
+; GFX1232-TRUE16-NEXT:    s_clause 0x1
+; GFX1232-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1232-TRUE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1232-TRUE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1232-TRUE16-NEXT:  ; %bb.1:
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, s4
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1232-TRUE16-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX1232-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1232-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1232-TRUE16-NEXT:  .LBB12_2:
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, 0, vcc_lo
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    s_endpgm
+;
+; GFX1232-FAKE16-LABEL: uniform_or_i8:
+; GFX1232-FAKE16:       ; %bb.0:
+; GFX1232-FAKE16-NEXT:    s_clause 0x1
+; GFX1232-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-FAKE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1232-FAKE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1232-FAKE16-NEXT:    s_cbranch_execz .LBB12_2
+; GFX1232-FAKE16-NEXT:  ; %bb.1:
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s7, s4, 0xff
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1232-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1232-FAKE16-NEXT:  .LBB12_2:
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
   store i8 %rmw, ptr addrspace(1) %result
   ret void
@@ -7745,262 +7915,524 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX1032-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: uniform_add_i8:
-; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-NEXT:    s_load_b32 s10, s[4:5], 0x34
-; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1164-NEXT:    s_cbranch_execz .LBB13_4
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_and_b32 s4, s2, -4
-; GFX1164-NEXT:    s_mov_b32 s5, s3
-; GFX1164-NEXT:    s_and_b32 s2, s2, 3
-; GFX1164-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164-NEXT:    s_lshl_b32 s11, s2, 3
-; GFX1164-NEXT:    s_mul_i32 s2, s10, s6
-; GFX1164-NEXT:    s_lshl_b32 s12, 0xff, s11
-; GFX1164-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX1164-NEXT:    s_not_b32 s13, s12
-; GFX1164-NEXT:    s_lshl_b32 s14, s2, s11
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s6, -1
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s14, v1
-; GFX1164-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-NEXT:    v_and_or_b32 v0, v1, s13, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1164-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
-; GFX1164-NEXT:  .LBB13_4: ; %Flow
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u16 v0, s10, v4, s2
-; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_endpgm
-;
-; GFX1132-LABEL: uniform_add_i8:
-; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1132-NEXT:    s_mov_b32 s10, 0
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1132-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1132-NEXT:    s_cbranch_execz .LBB13_4
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_and_b32 s4, s2, -4
-; GFX1132-NEXT:    s_mov_b32 s5, s3
-; GFX1132-NEXT:    s_and_b32 s2, s2, 3
-; GFX1132-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1132-NEXT:    s_bcnt1_i32_b32 s6, s6
-; GFX1132-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1132-NEXT:    s_mul_i32 s6, s8, s6
-; GFX1132-NEXT:    s_lshl_b32 s3, 0xff, s2
-; GFX1132-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX1132-NEXT:    s_not_b32 s11, s3
-; GFX1132-NEXT:    s_lshl_b32 s12, s6, s2
-; GFX1132-NEXT:    s_mov_b32 s6, -1
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1132-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_or_b32 v0, v1, s11, v0
-; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    s_or_b32 s10, vcc_lo, s10
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
-; GFX1132-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1132-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-NEXT:  .LBB13_4: ; %Flow
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mad_u16 v0, s8, v4, s2
-; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_endpgm
-;
-; GFX1264-LABEL: uniform_add_i8:
-; GFX1264:       ; %bb.0:
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-NEXT:    s_load_b32 s10, s[4:5], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1264-NEXT:    ; implicit-def: $vgpr0
-; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1264-NEXT:    s_cbranch_execz .LBB13_4
-; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_and_b32 s4, s2, -4
-; GFX1264-NEXT:    s_mov_b32 s5, s3
-; GFX1264-NEXT:    s_and_b32 s2, s2, 3
-; GFX1264-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1264-NEXT:    s_lshl_b32 s11, s2, 3
-; GFX1264-NEXT:    s_wait_alu 0xfffe
-; GFX1264-NEXT:    s_mul_i32 s2, s10, s6
-; GFX1264-NEXT:    s_lshl_b32 s12, 0xff, s11
-; GFX1264-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX1264-NEXT:    s_not_b32 s13, s12
-; GFX1264-NEXT:    s_lshl_b32 s14, s2, s11
-; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s6, -1
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1264-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1264-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_add_nc_u32_e32 v0, s14, v1
-; GFX1264-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT:    v_and_or_b32 v0, v1, s13, v0
-; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-NEXT:    s_wait_alu 0xfffe
-; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1264-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
-; GFX1264-NEXT:  .LBB13_4: ; %Flow
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT:    s_wait_alu 0xf1ff
-; GFX1264-NEXT:    v_mad_u16 v0, s10, v4, s2
-; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    buffer_store_b8 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_endpgm
-;
-; GFX1232-LABEL: uniform_add_i8:
-; GFX1232:       ; %bb.0:
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1232-NEXT:    s_mov_b32 s10, 0
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1232-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1232-NEXT:    ; implicit-def: $vgpr0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1232-NEXT:    s_cbranch_execz .LBB13_4
-; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_and_b32 s4, s2, -4
-; GFX1232-NEXT:    s_mov_b32 s5, s3
-; GFX1232-NEXT:    s_and_b32 s2, s2, 3
-; GFX1232-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1232-NEXT:    s_bcnt1_i32_b32 s6, s6
-; GFX1232-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_mul_i32 s6, s8, s6
-; GFX1232-NEXT:    s_lshl_b32 s3, 0xff, s2
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX1232-NEXT:    s_not_b32 s11, s3
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_lshl_b32 s12, s6, s2
-; GFX1232-NEXT:    s_mov_b32 s6, -1
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT:  .LBB13_2: ; %atomicrmw.start
-; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_or_b32 v0, v1, s11, v0
-; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    s_or_b32 s10, vcc_lo, s10
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
-; GFX1232-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX1232-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-NEXT:  .LBB13_4: ; %Flow
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT:    s_wait_alu 0xf1ff
-; GFX1232-NEXT:    v_mad_u16 v0, s8, v4, s2
-; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    buffer_store_b8 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_endpgm
+; GFX1164-TRUE16-LABEL: uniform_add_i8:
+; GFX1164-TRUE16:       ; %bb.0:
+; GFX1164-TRUE16-NEXT:    s_clause 0x1
+; GFX1164-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-TRUE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1164-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1164-TRUE16-NEXT:  ; %bb.1:
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-TRUE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1164-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, s10
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1164-TRUE16-NEXT:    s_mul_i32 s7, s7, s6
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s12, 0xff, s11
+; GFX1164-TRUE16-NEXT:    s_and_b32 s2, s7, 0xff
+; GFX1164-TRUE16-NEXT:    s_not_b32 s13, s12
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1164-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1164-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-TRUE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    s_endpgm
+;
+; GFX1164-FAKE16-LABEL: uniform_add_i8:
+; GFX1164-FAKE16:       ; %bb.0:
+; GFX1164-FAKE16-NEXT:    s_clause 0x1
+; GFX1164-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-FAKE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1164-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1164-FAKE16-NEXT:  ; %bb.1:
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1164-FAKE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1164-FAKE16-NEXT:    s_mul_i32 s2, s10, s6
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s12, 0xff, s11
+; GFX1164-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX1164-FAKE16-NEXT:    s_not_b32 s13, s12
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1164-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1164-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-FAKE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1164-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132-TRUE16-LABEL: uniform_add_i8:
+; GFX1132-TRUE16:       ; %bb.0:
+; GFX1132-TRUE16-NEXT:    s_clause 0x1
+; GFX1132-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, 0
+; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1132-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1132-TRUE16-NEXT:  ; %bb.1:
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1132-TRUE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s11, s8
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_mul_i32 s6, s11, s6
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s3, 0xff, s2
+; GFX1132-TRUE16-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX1132-TRUE16-NEXT:    s_not_b32 s11, s3
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1132-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132-FAKE16-LABEL: uniform_add_i8:
+; GFX1132-FAKE16:       ; %bb.0:
+; GFX1132-FAKE16-NEXT:    s_clause 0x1
+; GFX1132-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, 0
+; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1132-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1132-FAKE16-NEXT:  ; %bb.1:
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1132-FAKE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_mul_i32 s6, s8, s6
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s3, 0xff, s2
+; GFX1132-FAKE16-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX1132-FAKE16-NEXT:    s_not_b32 s11, s3
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1132-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX1132-FAKE16-NEXT:    s_endpgm
+;
+; GFX1264-TRUE16-LABEL: uniform_add_i8:
+; GFX1264-TRUE16:       ; %bb.0:
+; GFX1264-TRUE16-NEXT:    s_clause 0x1
+; GFX1264-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-TRUE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1264-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1264-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1264-TRUE16-NEXT:  ; %bb.1:
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-TRUE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, s10
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-TRUE16-NEXT:    s_mul_i32 s7, s7, s6
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s12, 0xff, s11
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-TRUE16-NEXT:    s_and_b32 s2, s7, 0xff
+; GFX1264-TRUE16-NEXT:    s_not_b32 s13, s12
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1264-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-TRUE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    s_endpgm
+;
+; GFX1264-FAKE16-LABEL: uniform_add_i8:
+; GFX1264-FAKE16:       ; %bb.0:
+; GFX1264-FAKE16-NEXT:    s_clause 0x1
+; GFX1264-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-FAKE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1264-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1264-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1264-FAKE16-NEXT:  ; %bb.1:
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1264-FAKE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-FAKE16-NEXT:    s_mul_i32 s2, s10, s6
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s12, 0xff, s11
+; GFX1264-FAKE16-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX1264-FAKE16-NEXT:    s_not_b32 s13, s12
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1264-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-FAKE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1264-FAKE16-NEXT:    s_endpgm
+;
+; GFX1232-TRUE16-LABEL: uniform_add_i8:
+; GFX1232-TRUE16:       ; %bb.0:
+; GFX1232-TRUE16-NEXT:    s_clause 0x1
+; GFX1232-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, 0
+; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1232-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1232-TRUE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1232-TRUE16-NEXT:  ; %bb.1:
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1232-TRUE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s11, s8
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_mul_i32 s6, s11, s6
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s3, 0xff, s2
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX1232-TRUE16-NEXT:    s_not_b32 s11, s3
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-TRUE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1232-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    s_endpgm
+;
+; GFX1232-FAKE16-LABEL: uniform_add_i8:
+; GFX1232-FAKE16:       ; %bb.0:
+; GFX1232-FAKE16-NEXT:    s_clause 0x1
+; GFX1232-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, 0
+; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1232-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1232-FAKE16-NEXT:    s_cbranch_execz .LBB13_4
+; GFX1232-FAKE16-NEXT:  ; %bb.1:
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1232-FAKE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_mul_i32 s6, s8, s6
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s3, 0xff, s2
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX1232-FAKE16-NEXT:    s_not_b32 s11, s3
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-FAKE16-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX1232-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT:  .LBB13_4: ; %Flow
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
   store i8 %rmw, ptr addrspace(1) %result
   ret void
@@ -8563,163 +8995,325 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: uniform_or_i16:
-; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_and_b32 s7, s2, 3
-; GFX1164-NEXT:    s_and_b32 s8, 0xffff, s6
-; GFX1164-NEXT:    s_lshl_b32 s7, s7, 3
-; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT:    s_lshl_b32 s9, s8, s7
-; GFX1164-NEXT:    s_and_b32 s8, s2, -4
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s9
-; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    s_mov_b32 s9, s3
-; GFX1164-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
-; GFX1164-NEXT:  .LBB15_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_endpgm
-;
-; GFX1132-LABEL: uniform_or_i16:
-; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-NEXT:    s_load_b32 s4, s[4:5], 0x34
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_and_b32 s6, s2, 3
-; GFX1132-NEXT:    s_and_b32 s7, 0xffff, s4
-; GFX1132-NEXT:    s_lshl_b32 s6, s6, 3
-; GFX1132-NEXT:    s_and_b32 s8, s2, -4
-; GFX1132-NEXT:    s_lshl_b32 s7, s7, s6
-; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    s_mov_b32 s9, s3
-; GFX1132-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
-; GFX1132-NEXT:  .LBB15_2:
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_endpgm
-;
-; GFX1264-LABEL: uniform_or_i16:
-; GFX1264:       ; %bb.0:
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1264-NEXT:    ; implicit-def: $vgpr0
-; GFX1264-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1264-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_and_b32 s7, s2, 3
-; GFX1264-NEXT:    s_and_b32 s8, 0xffff, s6
-; GFX1264-NEXT:    s_lshl_b32 s7, s7, 3
-; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT:    s_lshl_b32 s9, s8, s7
-; GFX1264-NEXT:    s_and_b32 s8, s2, -4
-; GFX1264-NEXT:    v_mov_b32_e32 v0, s9
-; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    s_mov_b32 s9, s3
-; GFX1264-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
-; GFX1264-NEXT:  .LBB15_2:
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_endpgm
-;
-; GFX1232-LABEL: uniform_or_i16:
-; GFX1232:       ; %bb.0:
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_load_b32 s4, s[4:5], 0x34
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1232-NEXT:    ; implicit-def: $vgpr0
-; GFX1232-NEXT:    s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_and_b32 s6, s2, 3
-; GFX1232-NEXT:    s_and_b32 s7, 0xffff, s4
-; GFX1232-NEXT:    s_lshl_b32 s6, s6, 3
-; GFX1232-NEXT:    s_and_b32 s8, s2, -4
-; GFX1232-NEXT:    s_lshl_b32 s7, s7, s6
-; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    s_mov_b32 s9, s3
-; GFX1232-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
-; GFX1232-NEXT:  .LBB15_2:
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_or_b32_e32 v0, s2, v0
-; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_endpgm
+; GFX1164-TRUE16-LABEL: uniform_or_i16:
+; GFX1164-TRUE16:       ; %bb.0:
+; GFX1164-TRUE16-NEXT:    s_clause 0x1
+; GFX1164-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1164-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1164-TRUE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-TRUE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1164-TRUE16-NEXT:  ; %bb.1:
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s8, s6
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1164-TRUE16-NEXT:    s_and_b32 s8, 0xffff, s8
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1164-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1164-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1164-TRUE16-NEXT:  .LBB15_2:
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-TRUE16-NEXT:    v_cndmask_b16 v0.l, s6, 0, vcc
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    s_endpgm
+;
+; GFX1164-FAKE16-LABEL: uniform_or_i16:
+; GFX1164-FAKE16:       ; %bb.0:
+; GFX1164-FAKE16-NEXT:    s_clause 0x1
+; GFX1164-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1164-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1164-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1164-FAKE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-FAKE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1164-FAKE16-NEXT:  ; %bb.1:
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1164-FAKE16-NEXT:    s_and_b32 s8, 0xffff, s6
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1164-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1164-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1164-FAKE16-NEXT:  .LBB15_2:
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132-TRUE16-LABEL: uniform_or_i16:
+; GFX1132-TRUE16:       ; %bb.0:
+; GFX1132-TRUE16-NEXT:    s_clause 0x1
+; GFX1132-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1132-TRUE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1132-TRUE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1132-TRUE16-NEXT:  ; %bb.1:
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, s4
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1132-TRUE16-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX1132-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1132-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1132-TRUE16-NEXT:  .LBB15_2:
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, 0, vcc_lo
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132-FAKE16-LABEL: uniform_or_i16:
+; GFX1132-FAKE16:       ; %bb.0:
+; GFX1132-FAKE16-NEXT:    s_clause 0x1
+; GFX1132-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-FAKE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1132-FAKE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1132-FAKE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1132-FAKE16-NEXT:  ; %bb.1:
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s7, 0xffff, s4
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1132-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], 0 glc
+; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1132-FAKE16-NEXT:  .LBB15_2:
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-FAKE16-NEXT:    s_endpgm
+;
+; GFX1264-TRUE16-LABEL: uniform_or_i16:
+; GFX1264-TRUE16:       ; %bb.0:
+; GFX1264-TRUE16-NEXT:    s_clause 0x1
+; GFX1264-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1264-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1264-TRUE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-TRUE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1264-TRUE16-NEXT:  ; %bb.1:
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s8, s6
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1264-TRUE16-NEXT:    s_and_b32 s8, 0xffff, s8
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1264-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1264-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1264-TRUE16-NEXT:  .LBB15_2:
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-TRUE16-NEXT:    v_cndmask_b16 v0.l, s6, 0, vcc
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    s_endpgm
+;
+; GFX1264-FAKE16-LABEL: uniform_or_i16:
+; GFX1264-FAKE16:       ; %bb.0:
+; GFX1264-FAKE16-NEXT:    s_clause 0x1
+; GFX1264-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1264-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1264-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1264-FAKE16-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-FAKE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1264-FAKE16-NEXT:  ; %bb.1:
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_and_b32 s7, s2, 3
+; GFX1264-FAKE16-NEXT:    s_and_b32 s8, 0xffff, s6
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s9, s8, s7
+; GFX1264-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v0, s9
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1264-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s7, v0
+; GFX1264-FAKE16-NEXT:  .LBB15_2:
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s6, 0, vcc
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-FAKE16-NEXT:    s_endpgm
+;
+; GFX1232-TRUE16-LABEL: uniform_or_i16:
+; GFX1232-TRUE16:       ; %bb.0:
+; GFX1232-TRUE16-NEXT:    s_clause 0x1
+; GFX1232-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1232-TRUE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1232-TRUE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1232-TRUE16-NEXT:  ; %bb.1:
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, s4
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1232-TRUE16-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX1232-TRUE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX1232-TRUE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1232-TRUE16-NEXT:  .LBB15_2:
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, 0, vcc_lo
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_or_b16 v0.l, s2, v0.l
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    s_endpgm
+;
+; GFX1232-FAKE16-LABEL: uniform_or_i16:
+; GFX1232-FAKE16:       ; %bb.0:
+; GFX1232-FAKE16-NEXT:    s_clause 0x1
+; GFX1232-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-FAKE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1232-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1232-FAKE16-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1232-FAKE16-NEXT:    s_cbranch_execz .LBB15_2
+; GFX1232-FAKE16-NEXT:  ; %bb.1:
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_and_b32 s6, s2, 3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s7, 0xffff, s4
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s6, s6, 3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s8, s2, -4
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s7, s7, s6
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX1232-FAKE16-NEXT:    buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s6, v0
+; GFX1232-FAKE16-NEXT:  .LBB15_2:
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, s4, 0, vcc_lo
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
   store i16 %rmw, ptr addrspace(1) %result
   ret void
@@ -9019,262 +9613,524 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: uniform_add_i16:
-; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-NEXT:    s_load_b32 s10, s[4:5], 0x34
-; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1164-NEXT:    s_cbranch_execz .LBB16_4
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_and_b32 s4, s2, -4
-; GFX1164-NEXT:    s_mov_b32 s5, s3
-; GFX1164-NEXT:    s_and_b32 s2, s2, 3
-; GFX1164-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1164-NEXT:    s_lshl_b32 s11, s2, 3
-; GFX1164-NEXT:    s_mul_i32 s2, s10, s6
-; GFX1164-NEXT:    s_lshl_b32 s12, 0xffff, s11
-; GFX1164-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX1164-NEXT:    s_not_b32 s13, s12
-; GFX1164-NEXT:    s_lshl_b32 s14, s2, s11
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s6, -1
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB16_2: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s14, v1
-; GFX1164-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1164-NEXT:    v_and_or_b32 v0, v1, s13, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB16_2
-; GFX1164-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
-; GFX1164-NEXT:  .LBB16_4: ; %Flow
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u16 v0, s10, v4, s2
-; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_endpgm
-;
-; GFX1132-LABEL: uniform_add_i16:
-; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1132-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1132-NEXT:    s_mov_b32 s10, 0
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1132-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1132-NEXT:    s_cbranch_execz .LBB16_4
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_and_b32 s4, s2, -4
-; GFX1132-NEXT:    s_mov_b32 s5, s3
-; GFX1132-NEXT:    s_and_b32 s2, s2, 3
-; GFX1132-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1132-NEXT:    s_bcnt1_i32_b32 s6, s6
-; GFX1132-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1132-NEXT:    s_mul_i32 s6, s8, s6
-; GFX1132-NEXT:    s_lshl_b32 s3, 0xffff, s2
-; GFX1132-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX1132-NEXT:    s_not_b32 s11, s3
-; GFX1132-NEXT:    s_lshl_b32 s12, s6, s2
-; GFX1132-NEXT:    s_mov_b32 s6, -1
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1132-NEXT:  .LBB16_2: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s12, v1
-; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_or_b32 v0, v1, s11, v0
-; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    s_or_b32 s10, vcc_lo, s10
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
-; GFX1132-NEXT:    s_cbranch_execnz .LBB16_2
-; GFX1132-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-NEXT:  .LBB16_4: ; %Flow
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mad_u16 v0, s8, v4, s2
-; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_endpgm
-;
-; GFX1264-LABEL: uniform_add_i16:
-; GFX1264:       ; %bb.0:
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-NEXT:    s_load_b32 s10, s[4:5], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
-; GFX1264-NEXT:    ; implicit-def: $vgpr0
-; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1264-NEXT:    s_cbranch_execz .LBB16_4
-; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_and_b32 s4, s2, -4
-; GFX1264-NEXT:    s_mov_b32 s5, s3
-; GFX1264-NEXT:    s_and_b32 s2, s2, 3
-; GFX1264-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX1264-NEXT:    s_lshl_b32 s11, s2, 3
-; GFX1264-NEXT:    s_wait_alu 0xfffe
-; GFX1264-NEXT:    s_mul_i32 s2, s10, s6
-; GFX1264-NEXT:    s_lshl_b32 s12, 0xffff, s11
-; GFX1264-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX1264-NEXT:    s_not_b32 s13, s12
-; GFX1264-NEXT:    s_lshl_b32 s14, s2, s11
-; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s6, -1
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1264-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1264-NEXT:  .LBB16_2: ; %atomicrmw.start
-; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_add_nc_u32_e32 v0, s14, v1
-; GFX1264-NEXT:    v_and_b32_e32 v0, s12, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT:    v_and_or_b32 v0, v1, s13, v0
-; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-NEXT:    s_wait_alu 0xfffe
-; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    s_cbranch_execnz .LBB16_2
-; GFX1264-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
-; GFX1264-NEXT:  .LBB16_4: ; %Flow
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT:    s_wait_alu 0xf1ff
-; GFX1264-NEXT:    v_mad_u16 v0, s10, v4, s2
-; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_endpgm
-;
-; GFX1232-LABEL: uniform_add_i16:
-; GFX1232:       ; %bb.0:
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1232-NEXT:    s_mov_b32 s10, 0
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
-; GFX1232-NEXT:    s_mov_b32 s9, exec_lo
-; GFX1232-NEXT:    ; implicit-def: $vgpr0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1232-NEXT:    s_cbranch_execz .LBB16_4
-; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_and_b32 s4, s2, -4
-; GFX1232-NEXT:    s_mov_b32 s5, s3
-; GFX1232-NEXT:    s_and_b32 s2, s2, 3
-; GFX1232-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1232-NEXT:    s_bcnt1_i32_b32 s6, s6
-; GFX1232-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_mul_i32 s6, s8, s6
-; GFX1232-NEXT:    s_lshl_b32 s3, 0xffff, s2
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_and_b32 s6, s6, 0xffff
-; GFX1232-NEXT:    s_not_b32 s11, s3
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_lshl_b32 s12, s6, s2
-; GFX1232-NEXT:    s_mov_b32 s6, -1
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT:  .LBB16_2: ; %atomicrmw.start
-; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_add_nc_u32_e32 v0, s12, v1
-; GFX1232-NEXT:    v_and_b32_e32 v0, s3, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_or_b32 v0, v1, s11, v0
-; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    s_or_b32 s10, vcc_lo, s10
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
-; GFX1232-NEXT:    s_cbranch_execnz .LBB16_2
-; GFX1232-NEXT:  ; %bb.3: ; %atomicrmw.end
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s10
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-NEXT:  .LBB16_4: ; %Flow
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT:    s_wait_alu 0xf1ff
-; GFX1232-NEXT:    v_mad_u16 v0, s8, v4, s2
-; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_endpgm
+; GFX1164-TRUE16-LABEL: uniform_add_i16:
+; GFX1164-TRUE16:       ; %bb.0:
+; GFX1164-TRUE16-NEXT:    s_clause 0x1
+; GFX1164-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-TRUE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1164-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1164-TRUE16-NEXT:  ; %bb.1:
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-TRUE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1164-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, s10
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1164-TRUE16-NEXT:    s_mul_i32 s7, s7, s6
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s12, 0xffff, s11
+; GFX1164-TRUE16-NEXT:    s_and_b32 s2, s7, 0xffff
+; GFX1164-TRUE16-NEXT:    s_not_b32 s13, s12
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1164-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1164-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-TRUE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    s_endpgm
+;
+; GFX1164-FAKE16-LABEL: uniform_add_i16:
+; GFX1164-FAKE16:       ; %bb.0:
+; GFX1164-FAKE16-NEXT:    s_clause 0x1
+; GFX1164-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-FAKE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1164-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1164-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1164-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1164-FAKE16-NEXT:  ; %bb.1:
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1164-FAKE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1164-FAKE16-NEXT:    s_mul_i32 s2, s10, s6
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s12, 0xffff, s11
+; GFX1164-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1164-FAKE16-NEXT:    s_not_b32 s13, s12
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1164-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1164-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1164-FAKE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132-TRUE16-LABEL: uniform_add_i16:
+; GFX1132-TRUE16:       ; %bb.0:
+; GFX1132-TRUE16-NEXT:    s_clause 0x1
+; GFX1132-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s10, 0
+; GFX1132-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1132-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1132-TRUE16-NEXT:  ; %bb.1:
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1132-TRUE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s11, s8
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_mul_i32 s6, s11, s6
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1132-TRUE16-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX1132-TRUE16-NEXT:    s_not_b32 s11, s3
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1132-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132-FAKE16-LABEL: uniform_add_i16:
+; GFX1132-FAKE16:       ; %bb.0:
+; GFX1132-FAKE16-NEXT:    s_clause 0x1
+; GFX1132-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, 0
+; GFX1132-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1132-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1132-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1132-FAKE16-NEXT:  ; %bb.1:
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1132-FAKE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_mul_i32 s6, s8, s6
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1132-FAKE16-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX1132-FAKE16-NEXT:    s_not_b32 s11, s3
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1132-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-FAKE16-NEXT:    s_endpgm
+;
+; GFX1264-TRUE16-LABEL: uniform_add_i16:
+; GFX1264-TRUE16:       ; %bb.0:
+; GFX1264-TRUE16-NEXT:    s_clause 0x1
+; GFX1264-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-TRUE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1264-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1264-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1264-TRUE16-NEXT:  ; %bb.1:
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-TRUE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, s10
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-TRUE16-NEXT:    s_mul_i32 s7, s7, s6
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s12, 0xffff, s11
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-TRUE16-NEXT:    s_and_b32 s2, s7, 0xffff
+; GFX1264-TRUE16-NEXT:    s_not_b32 s13, s12
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1264-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-TRUE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1264-TRUE16-NEXT:    v_mad_u16 v0.l, s10, v4.l, s2
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    s_endpgm
+;
+; GFX1264-FAKE16-LABEL: uniform_add_i16:
+; GFX1264-FAKE16:       ; %bb.0:
+; GFX1264-FAKE16-NEXT:    s_clause 0x1
+; GFX1264-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-FAKE16-NEXT:    s_load_b32 s10, s[4:5], 0x34
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[6:7], exec
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX1264-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1264-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1264-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1264-FAKE16-NEXT:  ; %bb.1:
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1264-FAKE16-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-FAKE16-NEXT:    s_mul_i32 s2, s10, s6
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s12, 0xffff, s11
+; GFX1264-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1264-FAKE16-NEXT:    s_not_b32 s13, s12
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s14, v1
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1264-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX1264-FAKE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1264-FAKE16-NEXT:    v_mad_u16 v0, s10, v4, s2
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-FAKE16-NEXT:    s_endpgm
+;
+; GFX1232-TRUE16-LABEL: uniform_add_i16:
+; GFX1232-TRUE16:       ; %bb.0:
+; GFX1232-TRUE16-NEXT:    s_clause 0x1
+; GFX1232-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s10, 0
+; GFX1232-TRUE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1232-TRUE16-NEXT:    ; implicit-def: $vgpr0_lo16
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1232-TRUE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1232-TRUE16-NEXT:  ; %bb.1:
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1232-TRUE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s11, s8
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_mul_i32 s6, s11, s6
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX1232-TRUE16-NEXT:    s_not_b32 s11, s3
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-TRUE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1232-TRUE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1232-TRUE16-NEXT:    v_mad_u16 v0.l, s8, v4.l, s2
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    s_endpgm
+;
+; GFX1232-FAKE16-LABEL: uniform_add_i16:
+; GFX1232-FAKE16:       ; %bb.0:
+; GFX1232-FAKE16-NEXT:    s_clause 0x1
+; GFX1232-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, 0
+; GFX1232-FAKE16-NEXT:    v_mbcnt_lo_u32_b32 v4, s6, 0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s9, exec_lo
+; GFX1232-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v4
+; GFX1232-FAKE16-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1232-FAKE16-NEXT:  ; %bb.1:
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1232-FAKE16-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_mul_i32 s6, s8, s6
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX1232-FAKE16-NEXT:    s_not_b32 s11, s3
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s12, s6, s2
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-FAKE16-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_add_nc_u32_e32 v0, s12, v1
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s11, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT:    s_or_b32 s10, vcc_lo, s10
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s10
+; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX1232-FAKE16-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s10
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT:  .LBB16_4: ; %Flow
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1232-FAKE16-NEXT:    v_mad_u16 v0, s8, v4, s2
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
   store i16 %rmw, ptr addrspace(1) %result
   ret void
@@ -9863,191 +10719,377 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: uniform_fadd_f16:
-; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s6, -1
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_and_b32 s4, s2, -4
-; GFX1164-NEXT:    s_mov_b32 s5, s3
-; GFX1164-NEXT:    s_and_b32 s2, s2, 3
-; GFX1164-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX1164-NEXT:    s_lshl_b32 s9, s2, 3
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_lshl_b32 s2, 0xffff, s9
-; GFX1164-NEXT:    s_not_b32 s10, s2
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
-; GFX1164-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1164-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_and_or_b32 v0, v1, s10, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
-; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_endpgm
-;
-; GFX1132-LABEL: uniform_fadd_f16:
-; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1132-NEXT:    s_mov_b32 s9, 0
-; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_and_b32 s4, s2, -4
-; GFX1132-NEXT:    s_mov_b32 s5, s3
-; GFX1132-NEXT:    s_and_b32 s2, s2, 3
-; GFX1132-NEXT:    s_load_b32 s6, s[4:5], 0x0
-; GFX1132-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_lshl_b32 s3, 0xffff, s2
-; GFX1132-NEXT:    s_not_b32 s3, s3
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1132-NEXT:    s_mov_b32 s6, -1
-; GFX1132-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
-; GFX1132-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1132-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_or_b32 v0, v1, s3, v0
-; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    s_or_b32 s9, vcc_lo, s9
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_endpgm
-;
-; GFX1264-LABEL: uniform_fadd_f16:
-; GFX1264:       ; %bb.0:
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s6, -1
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_and_b32 s4, s2, -4
-; GFX1264-NEXT:    s_mov_b32 s5, s3
-; GFX1264-NEXT:    s_and_b32 s2, s2, 3
-; GFX1264-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX1264-NEXT:    s_lshl_b32 s9, s2, 3
-; GFX1264-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1264-NEXT:    s_lshl_b32 s2, 0xffff, s9
-; GFX1264-NEXT:    s_not_b32 s10, s2
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1264-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1264-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
-; GFX1264-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1264-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT:    v_and_or_b32 v0, v1, s10, v0
-; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-NEXT:    s_wait_alu 0xfffe
-; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX1264-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
-; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_endpgm
-;
-; GFX1232-LABEL: uniform_fadd_f16:
-; GFX1232:       ; %bb.0:
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_load_b32 s8, s[4:5], 0x34
-; GFX1232-NEXT:    s_mov_b32 s9, 0
-; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_and_b32 s4, s2, -4
-; GFX1232-NEXT:    s_mov_b32 s5, s3
-; GFX1232-NEXT:    s_and_b32 s2, s2, 3
-; GFX1232-NEXT:    s_load_b32 s6, s[4:5], 0x0
-; GFX1232-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1232-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1232-NEXT:    s_lshl_b32 s3, 0xffff, s2
-; GFX1232-NEXT:    s_not_b32 s3, s3
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1232-NEXT:    s_mov_b32 s6, -1
-; GFX1232-NEXT:  .LBB18_1: ; %atomicrmw.start
-; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
-; GFX1232-NEXT:    v_add_f16_e32 v0, s8, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1232-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_or_b32 v0, v1, s3, v0
-; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    s_or_b32 s9, vcc_lo, s9
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT:    s_cbranch_execnz .LBB18_1
-; GFX1232-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s9
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_endpgm
+; GFX1164-TRUE16-LABEL: uniform_fadd_f16:
+; GFX1164-TRUE16:       ; %bb.0:
+; GFX1164-TRUE16-NEXT:    s_clause 0x1
+; GFX1164-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s9, s2, 3
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
+; GFX1164-TRUE16-NEXT:    s_not_b32 s10, s2
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
+; GFX1164-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1164-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    s_endpgm
+;
+; GFX1164-FAKE16-LABEL: uniform_fadd_f16:
+; GFX1164-FAKE16:       ; %bb.0:
+; GFX1164-FAKE16-NEXT:    s_clause 0x1
+; GFX1164-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s9, s2, 3
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
+; GFX1164-FAKE16-NEXT:    s_not_b32 s10, s2
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
+; GFX1164-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1164-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132-TRUE16-LABEL: uniform_fadd_f16:
+; GFX1132-TRUE16:       ; %bb.0:
+; GFX1132-TRUE16-NEXT:    s_clause 0x1
+; GFX1132-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s9, 0
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x0
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1132-TRUE16-NEXT:    s_not_b32 s3, s3
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1132-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT:    s_or_b32 s9, vcc_lo, s9
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
+; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1132-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132-FAKE16-LABEL: uniform_fadd_f16:
+; GFX1132-FAKE16:       ; %bb.0:
+; GFX1132-FAKE16-NEXT:    s_clause 0x1
+; GFX1132-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s9, 0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x0
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1132-FAKE16-NEXT:    s_not_b32 s3, s3
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1132-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT:    s_or_b32 s9, vcc_lo, s9
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
+; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1132-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-FAKE16-NEXT:    s_endpgm
+;
+; GFX1264-TRUE16-LABEL: uniform_fadd_f16:
+; GFX1264-TRUE16:       ; %bb.0:
+; GFX1264-TRUE16-NEXT:    s_clause 0x1
+; GFX1264-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s9, s2, 3
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
+; GFX1264-TRUE16-NEXT:    s_not_b32 s10, s2
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
+; GFX1264-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1264-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    s_endpgm
+;
+; GFX1264-FAKE16-LABEL: uniform_fadd_f16:
+; GFX1264-FAKE16:       ; %bb.0:
+; GFX1264-FAKE16-NEXT:    s_clause 0x1
+; GFX1264-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s9, s2, 3
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s2, 0xffff, s9
+; GFX1264-FAKE16-NEXT:    s_not_b32 s10, s2
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v1
+; GFX1264-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s9, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s10, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1264-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s9, v2
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-FAKE16-NEXT:    s_endpgm
+;
+; GFX1232-TRUE16-LABEL: uniform_fadd_f16:
+; GFX1232-TRUE16:       ; %bb.0:
+; GFX1232-TRUE16-NEXT:    s_clause 0x1
+; GFX1232-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-TRUE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s9, 0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x0
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1232-TRUE16-NEXT:    s_not_b32 s3, s3
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-TRUE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1232-TRUE16-NEXT:    v_add_f16_e32 v0.l, s8, v0.l
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT:    s_or_b32 s9, vcc_lo, s9
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
+; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1232-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    s_endpgm
+;
+; GFX1232-FAKE16-LABEL: uniform_fadd_f16:
+; GFX1232-FAKE16:       ; %bb.0:
+; GFX1232-FAKE16-NEXT:    s_clause 0x1
+; GFX1232-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-FAKE16-NEXT:    s_load_b32 s8, s[4:5], 0x34
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s9, 0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x0
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1232-FAKE16-NEXT:    s_not_b32 s3, s3
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-FAKE16-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1232-FAKE16-NEXT:    v_add_f16_e32 v0, s8, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT:    s_or_b32 s9, vcc_lo, s9
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s9
+; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX1232-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, half %val monotonic, align 2
   store half %rmw, ptr addrspace(1) %result
   ret void
@@ -10291,227 +11333,453 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX1032-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: uniform_fadd_bf16:
-; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1164-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_and_b32 s4, s2, -4
-; GFX1164-NEXT:    s_mov_b32 s5, s3
-; GFX1164-NEXT:    s_and_b32 s2, s2, 3
-; GFX1164-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1164-NEXT:    s_lshl_b32 s8, s2, 3
-; GFX1164-NEXT:    s_lshl_b32 s10, s6, 16
-; GFX1164-NEXT:    s_lshl_b32 s2, 0xffff, s8
-; GFX1164-NEXT:    s_mov_b32 s6, -1
-; GFX1164-NEXT:    s_not_b32 s9, s2
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    .p2align 6
-; GFX1164-NEXT:  .LBB19_1: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
-; GFX1164-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1164-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX1164-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX1164-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1164-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1164-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_and_or_b32 v0, v1, s9, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
-; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s2, -1
-; GFX1164-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1164-NEXT:    s_endpgm
-;
-; GFX1132-LABEL: uniform_fadd_bf16:
-; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1132-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_and_b32 s4, s2, -4
-; GFX1132-NEXT:    s_mov_b32 s5, s3
-; GFX1132-NEXT:    s_and_b32 s2, s2, 3
-; GFX1132-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1132-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1132-NEXT:    s_lshl_b32 s9, s6, 16
-; GFX1132-NEXT:    s_lshl_b32 s3, 0xffff, s2
-; GFX1132-NEXT:    s_mov_b32 s6, -1
-; GFX1132-NEXT:    s_not_b32 s8, s3
-; GFX1132-NEXT:    s_mov_b32 s3, 0
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1132-NEXT:    .p2align 6
-; GFX1132-NEXT:  .LBB19_1: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
-; GFX1132-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_f32_e32 v0, s9, v0
-; GFX1132-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX1132-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX1132-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1132-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1132-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_or_b32 v0, v1, s8, v0
-; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    s_or_b32 s3, vcc_lo, s3
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT:    s_mov_b32 s2, -1
-; GFX1132-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX1132-NEXT:    s_endpgm
-;
-; GFX1264-LABEL: uniform_fadd_bf16:
-; GFX1264:       ; %bb.0:
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1264-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_and_b32 s4, s2, -4
-; GFX1264-NEXT:    s_mov_b32 s5, s3
-; GFX1264-NEXT:    s_and_b32 s2, s2, 3
-; GFX1264-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1264-NEXT:    s_lshl_b32 s8, s2, 3
-; GFX1264-NEXT:    s_lshl_b32 s10, s6, 16
-; GFX1264-NEXT:    s_lshl_b32 s2, 0xffff, s8
-; GFX1264-NEXT:    s_mov_b32 s6, -1
-; GFX1264-NEXT:    s_not_b32 s9, s2
-; GFX1264-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT:  .LBB19_1: ; %atomicrmw.start
-; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
-; GFX1264-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_add_f32_e32 v0, s10, v0
-; GFX1264-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX1264-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX1264-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1264-NEXT:    s_wait_alu 0xfffd
-; GFX1264-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1264-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT:    v_and_or_b32 v0, v1, s9, v0
-; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX1264-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
-; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s2, -1
-; GFX1264-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1264-NEXT:    s_endpgm
-;
-; GFX1232-LABEL: uniform_fadd_bf16:
-; GFX1232:       ; %bb.0:
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_and_b32 s4, s2, -4
-; GFX1232-NEXT:    s_mov_b32 s5, s3
-; GFX1232-NEXT:    s_and_b32 s2, s2, 3
-; GFX1232-NEXT:    s_load_b32 s7, s[4:5], 0x0
-; GFX1232-NEXT:    s_lshl_b32 s2, s2, 3
-; GFX1232-NEXT:    s_lshl_b32 s9, s6, 16
-; GFX1232-NEXT:    s_lshl_b32 s3, 0xffff, s2
-; GFX1232-NEXT:    s_mov_b32 s6, -1
-; GFX1232-NEXT:    s_not_b32 s8, s3
-; GFX1232-NEXT:    s_mov_b32 s3, 0
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v1, s7
-; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT:  .LBB19_1: ; %atomicrmw.start
-; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
-; GFX1232-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_add_f32_e32 v0, s9, v0
-; GFX1232-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX1232-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX1232-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
-; GFX1232-NEXT:    s_wait_alu 0xfffd
-; GFX1232-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX1232-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_and_or_b32 v0, v1, s8, v0
-; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    s_or_b32 s3, vcc_lo, s3
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
-; GFX1232-NEXT:    s_cbranch_execnz .LBB19_1
-; GFX1232-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX1232-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
-; GFX1232-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT:    s_mov_b32 s2, -1
-; GFX1232-NEXT:    buffer_store_b16 v0, off, s[0:3], null
-; GFX1232-NEXT:    s_endpgm
+; GFX1164-TRUE16-LABEL: uniform_fadd_bf16:
+; GFX1164-TRUE16:       ; %bb.0:
+; GFX1164-TRUE16-NEXT:    s_clause 0x1
+; GFX1164-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s8, s2, 3
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s10, s6, 16
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s2, 0xffff, s8
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-TRUE16-NEXT:    s_not_b32 s9, s2
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-TRUE16-NEXT:    .p2align 6
+; GFX1164-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
+; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1164-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v2
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1164-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    s_endpgm
+;
+; GFX1164-FAKE16-LABEL: uniform_fadd_bf16:
+; GFX1164-FAKE16:       ; %bb.0:
+; GFX1164-FAKE16-NEXT:    s_clause 0x1
+; GFX1164-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1164-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s8, s2, 3
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s10, s6, 16
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s2, 0xffff, s8
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-FAKE16-NEXT:    s_not_b32 s9, s2
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-FAKE16-NEXT:    .p2align 6
+; GFX1164-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
+; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1164-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1164-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1164-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132-TRUE16-LABEL: uniform_fadd_bf16:
+; GFX1132-TRUE16:       ; %bb.0:
+; GFX1132-TRUE16-NEXT:    s_clause 0x1
+; GFX1132-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s9, s6, 16
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-TRUE16-NEXT:    s_not_b32 s8, s3
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-TRUE16-NEXT:    .p2align 6
+; GFX1132-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_add_f32_e32 v0, s9, v0
+; GFX1132-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1132-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1132-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1132-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132-FAKE16-LABEL: uniform_fadd_bf16:
+; GFX1132-FAKE16:       ; %bb.0:
+; GFX1132-FAKE16-NEXT:    s_clause 0x1
+; GFX1132-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s9, s6, 16
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-FAKE16-NEXT:    s_not_b32 s8, s3
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-FAKE16-NEXT:    .p2align 6
+; GFX1132-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_add_f32_e32 v0, s9, v0
+; GFX1132-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1132-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX1132-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX1132-FAKE16-NEXT:    s_endpgm
+;
+; GFX1264-TRUE16-LABEL: uniform_fadd_bf16:
+; GFX1264-TRUE16:       ; %bb.0:
+; GFX1264-TRUE16-NEXT:    s_clause 0x1
+; GFX1264-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s8, s2, 3
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s10, s6, 16
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s2, 0xffff, s8
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-TRUE16-NEXT:    s_not_b32 s9, s2
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
+; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1264-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1264-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1264-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v2
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1264-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    s_endpgm
+;
+; GFX1264-FAKE16-LABEL: uniform_fadd_bf16:
+; GFX1264-FAKE16:       ; %bb.0:
+; GFX1264-FAKE16-NEXT:    s_clause 0x1
+; GFX1264-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1264-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s8, s2, 3
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s10, s6, 16
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s2, 0xffff, s8
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-FAKE16-NEXT:    s_not_b32 s9, s2
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v1
+; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1264-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1264-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s8, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1264-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1264-FAKE16-NEXT:    s_endpgm
+;
+; GFX1232-TRUE16-LABEL: uniform_fadd_bf16:
+; GFX1232-TRUE16:       ; %bb.0:
+; GFX1232-TRUE16-NEXT:    s_clause 0x1
+; GFX1232-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-TRUE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s9, s6, 16
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-TRUE16-NEXT:    s_not_b32 s8, s3
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-TRUE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_add_f32_e32 v0, s9, v0
+; GFX1232-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1232-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1232-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1232-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX1232-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    s_endpgm
+;
+; GFX1232-FAKE16-LABEL: uniform_fadd_bf16:
+; GFX1232-FAKE16:       ; %bb.0:
+; GFX1232-FAKE16-NEXT:    s_clause 0x1
+; GFX1232-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_and_b32 s4, s2, -4
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-FAKE16-NEXT:    s_and_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_load_b32 s7, s[4:5], 0x0
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s9, s6, 16
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s3, 0xffff, s2
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-FAKE16-NEXT:    s_not_b32 s8, s3
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-FAKE16-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_add_f32_e32 v0, s9, v0
+; GFX1232-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX1232-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_and_or_b32 v0, v1, s8, v0
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX1232-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX1232-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, s2, v2
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, bfloat %val monotonic, align 2
   store bfloat %rmw, ptr addrspace(1) %result
   ret void
@@ -11109,232 +12377,483 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX1032-NEXT:    buffer_store_dword v2, off, s[8:11], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
-; GFX1164-LABEL: uniform_fadd_v2bf16:
-; GFX1164:       ; %bb.0:
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
-; GFX1164-NEXT:    s_load_b32 s0, s[4:5], 0x34
-; GFX1164-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s6, -1
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_load_b32 s1, s[10:11], 0x0
-; GFX1164-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX1164-NEXT:    s_and_b32 s13, s0, 0xffff0000
-; GFX1164-NEXT:    s_mov_b32 s4, s10
-; GFX1164-NEXT:    s_mov_b32 s5, s11
-; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1164-NEXT:    .p2align 6
-; GFX1164-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX1164-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1164-NEXT:    v_add_f32_e32 v0, s12, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_add_f32_e32 v2, s13, v2
-; GFX1164-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1164-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX1164-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1164-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX1164-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX1164-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1164-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1164-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1164-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1164-NEXT:    s_waitcnt vmcnt(0)
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1164-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX1164-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1164-NEXT:    s_set_inst_prefetch_distance 0x2
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    buffer_store_b32 v2, off, s[8:11], 0
-; GFX1164-NEXT:    s_endpgm
-;
-; GFX1132-LABEL: uniform_fadd_v2bf16:
-; GFX1132:       ; %bb.0:
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
-; GFX1132-NEXT:    s_load_b32 s0, s[4:5], 0x34
-; GFX1132-NEXT:    s_mov_b32 s1, 0
-; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1132-NEXT:    s_mov_b32 s6, -1
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_load_b32 s4, s[10:11], 0x0
-; GFX1132-NEXT:    s_lshl_b32 s2, s0, 16
-; GFX1132-NEXT:    s_and_b32 s3, s0, 0xffff0000
-; GFX1132-NEXT:    s_mov_b32 s5, s11
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s4
-; GFX1132-NEXT:    s_mov_b32 s4, s10
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX1132-NEXT:    .p2align 6
-; GFX1132-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX1132-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1132-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_add_f32_e32 v2, s3, v2
-; GFX1132-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX1132-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1132-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX1132-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX1132-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1132-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1132-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1132-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
-; GFX1132-NEXT:    s_waitcnt vmcnt(0)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1132-NEXT:    s_or_b32 s1, vcc_lo, s1
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX1132-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX1132-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1132-NEXT:    s_set_inst_prefetch_distance 0x2
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    buffer_store_b32 v2, off, s[8:11], 0
-; GFX1132-NEXT:    s_endpgm
-;
-; GFX1264-LABEL: uniform_fadd_v2bf16:
-; GFX1264:       ; %bb.0:
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
-; GFX1264-NEXT:    s_load_b32 s0, s[4:5], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[2:3], 0
-; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s6, -1
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_load_b32 s1, s[10:11], 0x0
-; GFX1264-NEXT:    s_lshl_b32 s12, s0, 16
-; GFX1264-NEXT:    s_and_b32 s13, s0, 0xffff0000
-; GFX1264-NEXT:    s_mov_b32 s4, s10
-; GFX1264-NEXT:    s_mov_b32 s5, s11
-; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1264-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX1264-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX1264-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1264-NEXT:    v_add_f32_e32 v0, s12, v0
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-NEXT:    v_add_f32_e32 v2, s13, v2
-; GFX1264-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1264-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX1264-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1264-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX1264-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX1264-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1264-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1264-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
-; GFX1264-NEXT:    s_wait_alu 0xfffd
-; GFX1264-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
-; GFX1264-NEXT:    s_wait_alu 0xf1ff
-; GFX1264-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
-; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1264-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1264-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v2, v0
-; GFX1264-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1264-NEXT:    s_wait_loadcnt 0x0
-; GFX1264-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1264-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1264-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX1264-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1264-NEXT:    s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    buffer_store_b32 v2, off, s[8:11], null
-; GFX1264-NEXT:    s_endpgm
-;
-; GFX1232-LABEL: uniform_fadd_v2bf16:
-; GFX1232:       ; %bb.0:
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
-; GFX1232-NEXT:    s_load_b32 s0, s[4:5], 0x34
-; GFX1232-NEXT:    s_mov_b32 s1, 0
-; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1232-NEXT:    s_mov_b32 s6, -1
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_load_b32 s4, s[10:11], 0x0
-; GFX1232-NEXT:    s_lshl_b32 s2, s0, 16
-; GFX1232-NEXT:    s_and_b32 s3, s0, 0xffff0000
-; GFX1232-NEXT:    s_mov_b32 s5, s11
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    v_mov_b32_e32 v1, s4
-; GFX1232-NEXT:    s_mov_b32 s4, s10
-; GFX1232-NEXT:  .LBB21_1: ; %atomicrmw.start
-; GFX1232-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1232-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
-; GFX1232-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GFX1232-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-NEXT:    v_add_f32_e32 v2, s3, v2
-; GFX1232-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1232-NEXT:    v_bfe_u32 v4, v2, 16, 1
-; GFX1232-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1232-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX1232-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX1232-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX1232-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
-; GFX1232-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
-; GFX1232-NEXT:    s_wait_alu 0xfffd
-; GFX1232-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX1232-NEXT:    s_wait_alu 0xf1ff
-; GFX1232-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s0
-; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX1232-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1232-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX1232-NEXT:    s_wait_loadcnt 0x0
-; GFX1232-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, v2
-; GFX1232-NEXT:    s_or_b32 s1, vcc_lo, s1
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX1232-NEXT:    s_cbranch_execnz .LBB21_1
-; GFX1232-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    buffer_store_b32 v2, off, s[8:11], null
-; GFX1232-NEXT:    s_endpgm
+; GFX1164-TRUE16-LABEL: uniform_fadd_v2bf16:
+; GFX1164-TRUE16:       ; %bb.0:
+; GFX1164-TRUE16-NEXT:    s_clause 0x1
+; GFX1164-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1164-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1164-TRUE16-NEXT:    s_mov_b64 s[8:9], 0
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    s_load_b32 s5, s[2:3], 0x0
+; GFX1164-TRUE16-NEXT:    s_and_b32 s10, s4, 0xffff0000
+; GFX1164-TRUE16-NEXT:    s_lshl_b32 s11, s4, 16
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s4, s2
+; GFX1164-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1164-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-TRUE16-NEXT:    .p2align 6
+; GFX1164-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1164-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1164-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v0, s11, v0
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_add_f32_e32 v2, s10, v2
+; GFX1164-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1164-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1164-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1164-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc
+; GFX1164-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1164-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1164-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-TRUE16-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX1164-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[8:9]
+; GFX1164-TRUE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1164-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX1164-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
+; GFX1164-TRUE16-NEXT:    s_endpgm
+;
+; GFX1164-FAKE16-LABEL: uniform_fadd_v2bf16:
+; GFX1164-FAKE16:       ; %bb.0:
+; GFX1164-FAKE16-NEXT:    s_clause 0x1
+; GFX1164-FAKE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1164-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x34
+; GFX1164-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    s_load_b32 s1, s[10:11], 0x0
+; GFX1164-FAKE16-NEXT:    s_lshl_b32 s12, s0, 16
+; GFX1164-FAKE16-NEXT:    s_and_b32 s13, s0, 0xffff0000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s4, s10
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s5, s11
+; GFX1164-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1164-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1164-FAKE16-NEXT:    .p2align 6
+; GFX1164-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1164-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1164-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v0, s12, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_add_f32_e32 v2, s13, v2
+; GFX1164-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1164-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1164-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1164-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1164-FAKE16-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1164-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1164-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1164-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1164-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1164-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1164-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1164-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1164-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX1164-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1164-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], 0
+; GFX1164-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132-TRUE16-LABEL: uniform_fadd_v2bf16:
+; GFX1132-TRUE16:       ; %bb.0:
+; GFX1132-TRUE16-NEXT:    s_clause 0x1
+; GFX1132-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1132-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s8, 0
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    s_load_b32 s5, s[2:3], 0x0
+; GFX1132-TRUE16-NEXT:    s_and_b32 s9, s4, 0xffff0000
+; GFX1132-TRUE16-NEXT:    s_lshl_b32 s10, s4, 16
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s4, s2
+; GFX1132-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1132-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-TRUE16-NEXT:    .p2align 6
+; GFX1132-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1132-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1132-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1132-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT:    v_add_f32_e32 v2, s9, v2
+; GFX1132-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1132-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1132-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1132-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1132-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1132-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX1132-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX1132-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1132-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-TRUE16-NEXT:    s_or_b32 s8, vcc_lo, s8
+; GFX1132-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s8
+; GFX1132-TRUE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1132-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-TRUE16-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX1132-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1132-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
+; GFX1132-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132-FAKE16-LABEL: uniform_fadd_v2bf16:
+; GFX1132-FAKE16:       ; %bb.0:
+; GFX1132-FAKE16-NEXT:    s_clause 0x1
+; GFX1132-FAKE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1132-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x34
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    s_load_b32 s4, s[10:11], 0x0
+; GFX1132-FAKE16-NEXT:    s_lshl_b32 s2, s0, 16
+; GFX1132-FAKE16-NEXT:    s_and_b32 s3, s0, 0xffff0000
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s5, s11
+; GFX1132-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s4, s10
+; GFX1132-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX1132-FAKE16-NEXT:    .p2align 6
+; GFX1132-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1132-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1132-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1132-FAKE16-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_add_f32_e32 v2, s3, v2
+; GFX1132-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1132-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX1132-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1132-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1132-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1132-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s0
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1132-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc
+; GFX1132-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1132-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1132-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX1132-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX1132-FAKE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1132-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1132-FAKE16-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX1132-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1132-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], 0
+; GFX1132-FAKE16-NEXT:    s_endpgm
+;
+; GFX1264-TRUE16-LABEL: uniform_fadd_v2bf16:
+; GFX1264-TRUE16:       ; %bb.0:
+; GFX1264-TRUE16-NEXT:    s_clause 0x1
+; GFX1264-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1264-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1264-TRUE16-NEXT:    s_mov_b64 s[8:9], 0
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    s_load_b32 s5, s[2:3], 0x0
+; GFX1264-TRUE16-NEXT:    s_and_b32 s10, s4, 0xffff0000
+; GFX1264-TRUE16-NEXT:    s_lshl_b32 s11, s4, 16
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s4, s2
+; GFX1264-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1264-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1264-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1264-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1264-TRUE16-NEXT:    v_add_f32_e32 v0, s11, v0
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_add_f32_e32 v2, s10, v2
+; GFX1264-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1264-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1264-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX1264-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1264-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1264-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc
+; GFX1264-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1264-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX1264-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1264-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-TRUE16-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX1264-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264-TRUE16-NEXT:    s_and_not1_b64 exec, exec, s[8:9]
+; GFX1264-TRUE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1264-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1264-TRUE16-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1264-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], null
+; GFX1264-TRUE16-NEXT:    s_endpgm
+;
+; GFX1264-FAKE16-LABEL: uniform_fadd_v2bf16:
+; GFX1264-FAKE16:       ; %bb.0:
+; GFX1264-FAKE16-NEXT:    s_clause 0x1
+; GFX1264-FAKE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1264-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x34
+; GFX1264-FAKE16-NEXT:    s_mov_b64 s[2:3], 0
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    s_load_b32 s1, s[10:11], 0x0
+; GFX1264-FAKE16-NEXT:    s_lshl_b32 s12, s0, 16
+; GFX1264-FAKE16-NEXT:    s_and_b32 s13, s0, 0xffff0000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s4, s10
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s5, s11
+; GFX1264-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1264-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1264-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1264-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1264-FAKE16-NEXT:    v_add_f32_e32 v0, s12, v0
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_add_f32_e32 v2, s13, v2
+; GFX1264-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1264-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1264-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX1264-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1264-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1264-FAKE16-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
+; GFX1264-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1264-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s[0:1]
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1264-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX1264-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1264-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1264-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX1264-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1264-FAKE16-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1264-FAKE16-NEXT:    s_and_not1_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1264-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1264-FAKE16-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1264-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], null
+; GFX1264-FAKE16-NEXT:    s_endpgm
+;
+; GFX1232-TRUE16-LABEL: uniform_fadd_v2bf16:
+; GFX1232-TRUE16:       ; %bb.0:
+; GFX1232-TRUE16-NEXT:    s_clause 0x1
+; GFX1232-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1232-TRUE16-NEXT:    s_load_b32 s4, s[4:5], 0x34
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s8, 0
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    s_load_b32 s5, s[2:3], 0x0
+; GFX1232-TRUE16-NEXT:    s_and_b32 s9, s4, 0xffff0000
+; GFX1232-TRUE16-NEXT:    s_lshl_b32 s10, s4, 16
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s4, s2
+; GFX1232-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, s5
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s5, s3
+; GFX1232-TRUE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1232-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1232-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1232-TRUE16-NEXT:    v_add_f32_e32 v0, s10, v0
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT:    v_add_f32_e32 v2, s9, v2
+; GFX1232-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1232-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1232-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1232-TRUE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1232-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1232-TRUE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX1232-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX1232-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX1232-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1232-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX1232-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-TRUE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-TRUE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-TRUE16-NEXT:    s_or_b32 s8, vcc_lo, s8
+; GFX1232-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s8
+; GFX1232-TRUE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1232-TRUE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1232-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1232-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1232-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], null
+; GFX1232-TRUE16-NEXT:    s_endpgm
+;
+; GFX1232-FAKE16-LABEL: uniform_fadd_v2bf16:
+; GFX1232-FAKE16:       ; %bb.0:
+; GFX1232-FAKE16-NEXT:    s_clause 0x1
+; GFX1232-FAKE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1232-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x34
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    s_load_b32 s4, s[10:11], 0x0
+; GFX1232-FAKE16-NEXT:    s_lshl_b32 s2, s0, 16
+; GFX1232-FAKE16-NEXT:    s_and_b32 s3, s0, 0xffff0000
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s5, s11
+; GFX1232-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s4, s10
+; GFX1232-FAKE16-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX1232-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1232-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1232-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX1232-FAKE16-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232-FAKE16-NEXT:    v_add_f32_e32 v2, s3, v2
+; GFX1232-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1232-FAKE16-NEXT:    v_bfe_u32 v4, v2, 16, 1
+; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1232-FAKE16-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX1232-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX1232-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1232-FAKE16-NEXT:    v_add3_u32 v4, v4, v2, 0x7fff
+; GFX1232-FAKE16-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX1232-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v3, v5, s0
+; GFX1232-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX1232-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1232-FAKE16-NEXT:    buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1232-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1232-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX1232-FAKE16-NEXT:    v_mov_b32_e32 v1, v2
+; GFX1232-FAKE16-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX1232-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX1232-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX1232-FAKE16-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX1232-FAKE16-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX1232-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX1232-FAKE16-NEXT:    buffer_store_b32 v2, off, s[8:11], null
+; GFX1232-FAKE16-NEXT:    s_endpgm
   %rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, <2 x bfloat> %val monotonic, align 4
   store <2 x bfloat> %rmw, ptr addrspace(1) %result
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX1132_DPP-FAKE16: {{.*}}
+; GFX1132_DPP-TRUE16: {{.*}}
+; GFX1132_ITERATIVE-FAKE16: {{.*}}
+; GFX1132_ITERATIVE-TRUE16: {{.*}}
+; GFX1164_DPP-FAKE16: {{.*}}
+; GFX1164_DPP-TRUE16: {{.*}}
+; GFX1164_ITERATIVE-FAKE16: {{.*}}
+; GFX1164_ITERATIVE-TRUE16: {{.*}}
+; GFX1232_DPP-FAKE16: {{.*}}
+; GFX1232_DPP-TRUE16: {{.*}}
+; GFX1232_ITERATIVE-FAKE16: {{.*}}
+; GFX1232_ITERATIVE-TRUE16: {{.*}}
+; GFX1264_DPP-FAKE16: {{.*}}
+; GFX1264_DPP-TRUE16: {{.*}}
+; GFX1264_ITERATIVE-FAKE16: {{.*}}
+; GFX1264_ITERATIVE-TRUE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
index 2c6aabec76330..6b9016df5cd89 100644
--- a/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast_vector_bigint.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 ; Make sure stack use isn't introduced for these bitcasts.
 
@@ -193,16 +194,30 @@ define <10 x i16> @bitcast_i160_to_v10i16(i160 %int) {
 ; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: bitcast_i160_to_v10i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
-; GFX12-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: bitcast_i160_to_v10i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v5, v0
+; GFX12-TRUE16-NEXT:    v_bfi_b32 v2, 0xffff, v6, v2
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: bitcast_i160_to_v10i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v0
+; GFX12-FAKE16-NEXT:    v_bfi_b32 v2, 0xffff, v2, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i160 %int to <10 x i16>
   ret <10 x i16> %bitcast
 }
@@ -235,19 +250,33 @@ define i12 @bitcast_v2i6_to_i12(<2 x i6> %vec) {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xfff, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: bitcast_v2i6_to_i12:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshlrev_b16 v1, 6, v1
-; GFX12-NEXT:    v_and_b32_e32 v0, 63, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xfff, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: bitcast_v2i6_to_i12:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 6, v1.l
+; GFX12-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 63
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    v_and_b16 v0.l, 0xfff, v0.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: bitcast_v2i6_to_i12:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b16 v1, 6, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xfff, v0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast <2 x i6> %vec to i12
   ret i12 %bitcast
 }
@@ -262,18 +291,31 @@ define <2 x i6> @bitcast_i12_to_v2i6(i12 %int) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: bitcast_i12_to_v2i6:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b16 v1, 6, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 63, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v1, 63, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: bitcast_i12_to_v2i6:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 6, v0.l
+; GFX12-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 63
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_and_b16 v1.l, v0.h, 63
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: bitcast_i12_to_v2i6:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b16 v1, 6, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 63, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 63, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bitcast = bitcast i12 %int to <2 x i6>
   ret <2 x i6> %bitcast
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index bb7974335bf28..a9358dc4a51d8 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -3,8 +3,10 @@
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-FLAT
-; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-GISEL
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-FLAT,GFX11-FLAT-TRUE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-FLAT,GFX11-FLAT-FAKE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-GISEL,GFX11-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11-GISEL,GFX11-GISEL-FAKE16
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
@@ -71,21 +73,37 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
 ; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: s_brev_i16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_clause 0x1
-; GFX11-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
-; GFX11-GISEL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-GISEL-TRUE16-LABEL: s_brev_i16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX11-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-GISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    s_brev_b32 s2, s2
+; GFX11-GISEL-TRUE16-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX11-GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: s_brev_i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    s_brev_b32 s2, s2
+; GFX11-GISEL-FAKE16-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-GISEL-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, ptr addrspace(1) %out
   ret void
@@ -160,16 +178,29 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa
 ; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
 ; GFX11-FLAT-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: v_brev_i16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
-; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-GISEL-TRUE16-LABEL: v_brev_i16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfrev_b32_e32 v1, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-GISEL-FAKE16-LABEL: v_brev_i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_bfrev_b32_e32 v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-GISEL-FAKE16-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, ptr addrspace(1) %out
@@ -871,23 +902,40 @@ define float @missing_truncate_promote_bitreverse(i32 %arg) {
 ; GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-FLAT-LABEL: missing_truncate_promote_bitreverse:
-; GFX11-FLAT:       ; %bb.0: ; %bb
-; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-FLAT-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-FLAT-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: missing_truncate_promote_bitreverse:
-; GFX11-GISEL:       ; %bb.0: ; %bb
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FLAT-TRUE16-LABEL: missing_truncate_promote_bitreverse:
+; GFX11-FLAT-TRUE16:       ; %bb.0: ; %bb
+; GFX11-FLAT-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLAT-TRUE16-NEXT:    v_bfrev_b32_e32 v0, v0
+; GFX11-FLAT-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLAT-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-FLAT-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FLAT-FAKE16-LABEL: missing_truncate_promote_bitreverse:
+; GFX11-FLAT-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FLAT-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLAT-FAKE16-NEXT:    v_bfrev_b32_e32 v0, v0
+; GFX11-FLAT-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLAT-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FLAT-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FLAT-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: missing_truncate_promote_bitreverse:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfrev_b32_e32 v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: missing_truncate_promote_bitreverse:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %bb
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_bfrev_b32_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = trunc i32 %arg to i16
   %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 6be80d2f5957b..fb4a981ec17d1 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s
 
 declare hidden void @external_void_func_i1(i1) #0
@@ -4855,22 +4856,40 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
-; GFX11-NEXT:    s_mov_b32 s32, 16
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b8 off, v0, off
-; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:4
-; GFX11-NEXT:    scratch_load_b64 v[0:1], off, off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 16
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-TRUE16-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b8 off, v0, off
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v1, off offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 16
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-FAKE16-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b8 off, v0, off
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v1, off offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
@@ -5021,36 +5040,68 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
-; GFX11-NEXT:    s_mov_b32 s32, 32
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b8 off, v0, off
-; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:4
-; GFX11-NEXT:    scratch_load_b64 v[0:1], off, off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
-; GFX11-NEXT:    v_mov_b32_e32 v0, 8
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u8 v0, off, off offset:8
-; GFX11-NEXT:    scratch_load_b32 v1, off, off offset:12
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 32
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-TRUE16-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b8 off, v0, off
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v1, off offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 8
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, off offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, off offset:12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_nop 0
+; GFX11-TRUE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 32
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-FAKE16-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b8 off, v0, off
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v1, off offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b64 v[0:1], off, off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 8
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v0, off, off offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, off offset:12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_nop 0
+; GFX11-FAKE16-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; HSA:       ; %bb.0:
@@ -5230,37 +5281,69 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_call_external_void_func_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16
-; GFX11-NEXT:    v_mov_b32_e32 v8, v2
-; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
-; GFX11-NEXT:    v_mov_b32_e32 v2, v17
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-TRUE16-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_b128 v[16:19], off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-FAKE16-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v17
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_v16i8:
 ; HSA:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index da52474f08fbd..0c335e45c9e2f 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
 
 ; Make sure we don't crash or assert on spir_kernel calling convention.
 
@@ -60,10 +61,15 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: ps_ret_cc_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: ps_ret_cc_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: ps_ret_cc_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -81,10 +87,15 @@ define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
 ; VI-NEXT:    v_add_f16_e64 v0, s0, 1.0
 ; VI-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: ps_ret_cc_inreg_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f16_e64 v0, s0, 1.0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: ps_ret_cc_inreg_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, s0, 1.0
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: ps_ret_cc_inreg_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e64 v0, s0, 1.0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -311,10 +322,15 @@ define amdgpu_cs half @cs_mesa(half %arg0) {
 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: cs_mesa:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: cs_mesa:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: cs_mesa:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -333,10 +349,15 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) {
 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: ps_mesa_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: ps_mesa_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: ps_mesa_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -355,10 +376,15 @@ define amdgpu_vs half @vs_mesa(half %arg0) {
 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: vs_mesa:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: vs_mesa:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: vs_mesa:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -377,10 +403,15 @@ define amdgpu_gs half @gs_mesa(half %arg0) {
 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: gs_mesa:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gs_mesa:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gs_mesa:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -399,10 +430,15 @@ define amdgpu_hs half @hs_mesa(half %arg0) {
 ; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; VI-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: hs_mesa:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: hs_mesa:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: hs_mesa:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -940,11 +976,17 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: ps_mesa_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: ps_mesa_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: ps_mesa_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_endpgm
   %add = add i16 %arg0, %arg0
   store i16 %add, ptr addrspace(1) poison
   ret void
@@ -2232,35 +2274,65 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
 ; VI-NEXT:    flat_store_byte v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: amdgpu_cs_v8i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
-; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
-; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 4, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: amdgpu_cs_v8i1:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, v6.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 1, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, v4.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, v2.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 1, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 3, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 2, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 3, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 2, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v5.h, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, v0.h, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v2.h, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 4, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: amdgpu_cs_v8i1:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v7, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 4, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_endpgm
   store <8 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -2367,60 +2439,115 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: amdgpu_cs_v16i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v9
-; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11-NEXT:    v_lshlrev_b16 v13, 1, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v11
-; GFX11-NEXT:    v_lshlrev_b16 v10, 2, v10
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
-; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
-; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b16 v15, 3, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 2, v14
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v10, v11, v10
-; GFX11-NEXT:    v_and_b32_e32 v1, 3, v8
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX11-NEXT:    v_or_b32_e32 v3, v15, v14
-; GFX11-NEXT:    v_and_b32_e32 v6, 3, v12
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v10
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v6, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 15, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b16 v3, 4, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b16 v2, 12, v2
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: amdgpu_cs_v16i1:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, v10.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 1, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, v8.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, v6.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 1, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, v4.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, v2.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 1, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, v14.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 1, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, v12.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 3, v11.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 2, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v8.l, v9.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 3, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 2, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v4.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 3, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 2, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 3, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 2, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v12.l, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, v5.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v7.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, v0.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v2.h, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v12.h, v10.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.l, v8.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v3.l, v2.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, 15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 4, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 12, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: amdgpu_cs_v16i1:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 1, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 1, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 3, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 2, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 3, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 2, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v11, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 3, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v15, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 3, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v6, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 4, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v2, 12, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_endpgm
   store <16 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -2619,107 +2746,209 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
 ; VI-NEXT:    flat_store_dword v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: amdgpu_cs_v32i1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v9
-; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v11
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_lshlrev_b16 v10, 2, v10
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11-NEXT:    v_lshlrev_b16 v13, 1, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX11-NEXT:    v_and_b32_e32 v8, 3, v8
-; GFX11-NEXT:    v_lshlrev_b16 v7, 3, v7
-; GFX11-NEXT:    v_lshlrev_b16 v6, 2, v6
-; GFX11-NEXT:    v_lshlrev_b16 v3, 3, v3
-; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b16 v15, 3, v15
-; GFX11-NEXT:    v_lshlrev_b16 v14, 2, v14
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v6, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX11-NEXT:    v_or_b32_e32 v13, v15, v14
-; GFX11-NEXT:    v_and_b32_e32 v12, 3, v12
-; GFX11-NEXT:    v_and_b32_e32 v3, 15, v6
-; GFX11-NEXT:    v_lshlrev_b16 v6, 1, v29
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v26
-; GFX11-NEXT:    v_and_b32_e32 v7, 1, v28
-; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v25
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v24
-; GFX11-NEXT:    v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT:    v_or_b32_e32 v1, v12, v13
-; GFX11-NEXT:    v_lshlrev_b16 v8, 3, v27
-; GFX11-NEXT:    v_lshlrev_b16 v2, 2, v2
-; GFX11-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, v10, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 1, v22
-; GFX11-NEXT:    v_lshlrev_b16 v10, 1, v21
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v20
-; GFX11-NEXT:    v_and_b32_e32 v13, 1, v18
-; GFX11-NEXT:    v_lshlrev_b16 v14, 1, v17
-; GFX11-NEXT:    v_and_b32_e32 v15, 1, v16
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v30
-; GFX11-NEXT:    v_or_b32_e32 v2, v8, v2
-; GFX11-NEXT:    v_lshlrev_b16 v8, 3, v23
-; GFX11-NEXT:    v_lshlrev_b16 v9, 2, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v10
-; GFX11-NEXT:    v_lshlrev_b16 v12, 3, v19
-; GFX11-NEXT:    v_lshlrev_b16 v13, 2, v13
-; GFX11-NEXT:    v_or_b32_e32 v14, v15, v14
-; GFX11-NEXT:    v_lshlrev_b16 v11, 3, v31
-; GFX11-NEXT:    v_lshlrev_b16 v5, 2, v5
-; GFX11-NEXT:    v_and_b32_e32 v7, 3, v7
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_and_b32_e32 v9, 3, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, v12, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 3, v14
-; GFX11-NEXT:    v_or_b32_e32 v5, v11, v5
-; GFX11-NEXT:    v_and_b32_e32 v6, 3, v6
-; GFX11-NEXT:    v_or_b32_e32 v2, v7, v2
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v12, v10
-; GFX11-NEXT:    v_lshlrev_b16 v4, 4, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX11-NEXT:    v_lshlrev_b16 v6, 4, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 15, v8
-; GFX11-NEXT:    v_lshlrev_b16 v1, 12, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-NEXT:    v_lshlrev_b16 v4, 12, v5
-; GFX11-NEXT:    v_lshlrev_b16 v2, 8, v2
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: amdgpu_cs_v32i1:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, v10.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 1, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, v8.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 3, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.h, v6.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.l, 2, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 1, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.h, v8.l, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, v4.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 3, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 2, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, v6.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v4.l, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, v2.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 3, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v6.l, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v5.h, v9.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, v0.h, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 1, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, v28.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, v26.l, 1
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, v3.l, 15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 1, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 2, v1.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.l, 2, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 1, v25.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v3.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 3, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, v24.l, 1
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v2.h, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 1, v21.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v5.l, v4.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, v22.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, v20.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.h, v18.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 1, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, v16.l, 1
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, v14.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 1, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, v12.l, 1
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, v30.l, 1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 3, v23.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 2, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v6.l, v5.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 3, v19.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 2, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 3, v15.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 2, v14.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 3, v31.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 2, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, v4.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.l, v5.h, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, v7.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.h, v12.h, v13.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, v12.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v2.h, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, v3.l, 3
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v4.l, v3.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.h, v5.l, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v6.l, v5.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.h, v8.h, v10.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 4, v0.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 15
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, v2.h, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, v3.l, 15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 4, v3.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, v4.l, 15
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 12, v8.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 12, v1.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.h, v3.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: amdgpu_cs_v32i1:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 1, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 3, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 2, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 1, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 3, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 3, v15
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 2, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 3, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 15, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 1, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v26
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 1, v28
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 1, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 1, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 3, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v12, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 3, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v10, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 1, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 1, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 1, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 1, v18
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 1, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 1, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 1, v30
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 3, v23
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 2, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v12, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 3, v19
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 2, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v15, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 3, v31
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 2, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 3, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 3, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v12, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 3, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 3, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v7, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v12, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v4, 4, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v6, 4, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v4, 12, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_endpgm
   store <32 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 81f79ea3c3fa8..f7c58ca9599b4 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
 ; SI-LABEL: v_clamp_add_src_f32:
@@ -482,18 +483,31 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_clamp_add_src_f16_denorm:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_clamp_add_src_f16_denorm:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 clamp
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_clamp_add_src_f16_denorm:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
   %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -552,18 +566,31 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_clamp_add_src_f16_no_denormals:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 clamp
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_clamp_add_src_f16_no_denormals:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_f16_e64 v1, v1, 1.0 clamp
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
   %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -1525,22 +1552,39 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v1, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_no_clamp_add_src_v2f16_f16_src:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0 clamp
+; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_no_clamp_add_src_v2f16_f16_src:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v1, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v1, 1.0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
+; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
   %out.gep = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 17c84d7371de1..3e0837b58aafc 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #0 {
 ; GFX6-LABEL: v_clamp_f32:
@@ -585,31 +587,57 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_clamp_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: v_clamp_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, v1, v1 clamp
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_clamp_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, v0.l, v0.l clamp
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_clamp_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, v1, v1 clamp
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: v_clamp_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, v0.l, v0.l clamp
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: v_clamp_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, v1, v1 clamp
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
   %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -667,31 +695,57 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_clamp_neg_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: v_clamp_neg_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, -v1, -v1 clamp
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_clamp_neg_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l clamp
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_clamp_neg_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -v1, -v1 clamp
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: v_clamp_neg_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, -v0.l, -v0.l clamp
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: v_clamp_neg_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, -v1, -v1 clamp
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
   %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -750,31 +804,57 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_clamp_negabs_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: v_clamp_negabs_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_clamp_negabs_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| clamp
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_clamp_negabs_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1| clamp
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: v_clamp_negabs_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, -|v0.l|, -|v0.l| clamp
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: v_clamp_negabs_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid
   %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 10d71a315fbf9..9d679779fed0e 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck %s -check-prefixes=GCN,GFX11
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -enable-no-nans-fp-math < %s | FileCheck %s -check-prefixes=GCN,GFX11NONANS
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck %s -check-prefixes=GCN,GFX11,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 < %s | FileCheck %s -check-prefixes=GCN,GFX11,GFX11-FAKE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -enable-no-nans-fp-math < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16,GFX11NONANS,GFX11NONANS-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -amdgpu-enable-delay-alu=0 -enable-no-nans-fp-math < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16,GFX11NONANS,GFX11NONANS-FAKE16
 
 ; The tests check the following optimization of DAGCombiner:
 ; CMP(A,C)||CMP(B,C) => CMP(MIN/MAX(A,B), C)
@@ -861,13 +863,21 @@ define i1 @test58(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test58:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test58:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test58:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
   %and1  = and i1 %cmp1, %cmp2
@@ -883,13 +893,21 @@ define i1 @test59(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test59:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test59:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test59:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
   %and1  = and i1 %cmp1, %cmp2
@@ -905,13 +923,21 @@ define i1 @test60(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test60:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test60:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test60:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
   %and1  = and i1 %cmp1, %cmp2
@@ -927,13 +953,21 @@ define i1 @test61(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test61:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test61:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test61:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
   %and1 = and i1 %cmp1, %cmp2
@@ -1090,13 +1124,21 @@ define i1 @test70(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test70:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test70:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test70:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp olt float %var1, %arg3
@@ -1151,13 +1193,21 @@ define i1 @test73(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test73:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test73:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test73:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp oge float %var1, %arg3
@@ -1177,15 +1227,25 @@ define i1 @test74(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test74:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test74:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-TRUE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-TRUE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test74:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-FAKE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-FAKE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
   %cmp1 = fcmp ugt double %var1, %arg3
@@ -1204,13 +1264,21 @@ define i1 @test75(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test75:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test75:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test75:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp uge float %var1, %arg3
@@ -1229,13 +1297,21 @@ define i1 @test76(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test76:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test76:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test76:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp ule float %var1, %arg3
@@ -1255,15 +1331,25 @@ define i1 @test77(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test77:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test77:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-TRUE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test77:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-FAKE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
   %cmp1 = fcmp ult double %var1, %arg3
@@ -1295,13 +1381,21 @@ define i1 @test79(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test79:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test79:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test79:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
   %and1  = and i1 %cmp1, %cmp2
@@ -1371,13 +1465,21 @@ define i1 @test83(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test83:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test83:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test83:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp ule float %var1, %arg3
@@ -1387,23 +1489,41 @@ define i1 @test83(float %arg1, float %arg2, float %arg3) {
 }
 
 define i1 @test84(half %arg1, half %arg2, half %arg3) {
-; GFX11-LABEL: test84:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test84:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test84:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test84:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test84:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GCN-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test84:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
   %cmp1 = fcmp olt half %var1, %arg3
@@ -1413,31 +1533,53 @@ define i1 @test84(half %arg1, half %arg2, half %arg3) {
 }
 
 define <2 x i1> @test85(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
-; GFX11-LABEL: test85:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test85:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-TRUE16-NEXT:    v_pk_min_f16 v1, v0, v1
+; GFX11-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1.h, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test85:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11NONANS-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v1
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test85:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test85:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_pk_min_f16 v1, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1.h, v2.h
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test85:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GCN-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v1
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
   %cmp1 = fcmp ole <2 x half> %var1, %arg3
@@ -1447,31 +1589,53 @@ define <2 x i1> @test85(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 }
 
 define <2 x i1> @test86(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
-; GFX11-LABEL: test86:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test86:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v0, v1
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1.h, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test86:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11NONANS-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test86:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test86:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_pk_max_f16 v1, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1.h, v2.h
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test86:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GCN-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
   %cmp1 = fcmp ogt <2 x half> %var1, %arg3
@@ -1481,23 +1645,41 @@ define <2 x i1> @test86(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 }
 
 define i1 @test87(half %arg1, half %arg2, half %arg3) {
-; GFX11-LABEL: test87:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test87:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test87:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test87:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test87:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GCN-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test87:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
   %cmp1 = fcmp oge half %var1, %arg3
@@ -1507,31 +1689,53 @@ define i1 @test87(half %arg1, half %arg2, half %arg3) {
 }
 
 define <2 x i1> @test88(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
-; GFX11-LABEL: test88:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test88:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-TRUE16-NEXT:    v_pk_min_f16 v1, v0, v1
+; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1.h, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test88:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11NONANS-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test88:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test88:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_pk_min_f16 v1, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1.h, v2.h
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test88:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GCN-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v1
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
   %cmp1 = fcmp ugt <2 x half> %var1, %arg3
@@ -1541,23 +1745,41 @@ define <2 x i1> @test88(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
 }
 
 define i1 @test89(half %arg1, half %arg2, half %arg3) {
-; GFX11-LABEL: test89:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test89:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test89:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test89:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test89:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GCN-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test89:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
   %cmp1 = fcmp uge half %var1, %arg3
@@ -1567,23 +1789,41 @@ define i1 @test89(half %arg1, half %arg2, half %arg3) {
 }
 
 define i1 @test90(half %arg1, half %arg2, half %arg3) {
-; GFX11-LABEL: test90:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test90:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test90:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test90:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test90:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GCN-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test90:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call half @llvm.canonicalize.f16(half %arg1)
   %var2 = call half @llvm.canonicalize.f16(half %arg2)
   %cmp1 = fcmp ule half %var1, %arg3
@@ -1593,31 +1833,53 @@ define i1 @test90(half %arg1, half %arg2, half %arg3) {
 }
 
 define <2 x i1> @test91(<2 x half> %arg1, <2 x half> %arg2, <2 x half> %arg3) {
-; GFX11-LABEL: test91:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test91:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v0, v1
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v1.h, v2.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test91:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11NONANS-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11NONANS-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v1
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: test91:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test91:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_pk_max_f16 v1, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1.l, v2.l
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1.h, v2.h
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test91:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GCN-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GCN-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v1
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg1)
   %var2 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %arg2)
   %cmp1 = fcmp ult <2 x half> %var1, %arg3
@@ -2021,13 +2283,21 @@ define i1 @test108(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test108:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test108:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max3_f32 v0, v0, v1, v2
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test108:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max3_f32 v0, v0, v1, v2
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
   %cmp3 = fcmp ult float %arg3, %C
@@ -2049,15 +2319,25 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test109:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
-; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
-; GFX11NONANS-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test109:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GCN-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
+; GCN-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test109:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GCN-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v1, v4
+; GCN-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
   %cmp3 = fcmp ogt float %arg3, %C
@@ -2107,16 +2387,27 @@ define i1 @test111(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test111:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test111:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v2, v2, v3
+; GCN-TRUE16-NEXT:    v_min3_f32 v0, v0, v1, v2
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v4
+; GCN-TRUE16-NEXT:    v_min3_f32 v0, v5, v6, v0
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test111:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v2, v2, v3
+; GCN-FAKE16-NEXT:    v_min3_f32 v0, v0, v1, v2
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v4
+; GCN-FAKE16-NEXT:    v_min3_f32 v0, v5, v6, v0
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
   %or1  = or i1 %cmp1, %cmp2
@@ -2150,16 +2441,27 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test112:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX11NONANS-NEXT:    v_min3_f32 v0, v5, v6, v0
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test112:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v2, v2, v3
+; GCN-TRUE16-NEXT:    v_min3_f32 v0, v0, v1, v2
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v4
+; GCN-TRUE16-NEXT:    v_min3_f32 v0, v5, v6, v0
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test112:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v2, v2, v3
+; GCN-FAKE16-NEXT:    v_min3_f32 v0, v0, v1, v2
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v4
+; GCN-FAKE16-NEXT:    v_min3_f32 v0, v5, v6, v0
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v8
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
   %or1  = or i1 %cmp1, %cmp2
@@ -2190,13 +2492,21 @@ define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test113:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_maxmin_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test113:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test113:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_maxmin_f32 v0, v0, v1, v2
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v3
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %C
   %cmp2 = fcmp ult float %arg2, %C
   %cmp3 = fcmp olt float %arg3, %C
@@ -2217,15 +2527,25 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test114:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
-; GFX11NONANS-NEXT:    s_and_b32 s0, s0, vcc_lo
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test114:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GCN-TRUE16-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
+; GCN-TRUE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test114:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GCN-FAKE16-NEXT:    v_cmp_gt_f32_e64 s0, v0, v3
+; GCN-FAKE16-NEXT:    s_and_b32 s0, s0, vcc_lo
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt float %arg1, %C
   %cmp2 = fcmp ogt float %arg2, %C
   %cmp3 = fcmp ult float %arg3, %C
@@ -2247,14 +2567,23 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test115:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v2, v2, v3
-; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test115:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v2, v2, v3
+; GCN-TRUE16-NEXT:    v_min3_f32 v0, v0, v1, v2
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test115:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v2, v2, v3
+; GCN-FAKE16-NEXT:    v_min3_f32 v0, v0, v1, v2
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v4
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
   %var3 = call float @llvm.canonicalize.f32(float %arg3)
@@ -2290,22 +2619,39 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test116:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v8, v8, v9
-; GFX11NONANS-NEXT:    v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5
-; GFX11NONANS-NEXT:    v_max_f32_e32 v4, v6, v7
-; GFX11NONANS-NEXT:    v_min3_f32 v0, v0, v1, v8
-; GFX11NONANS-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v10
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v3, v10
-; GFX11NONANS-NEXT:    v_cmp_gt_f32_e64 s1, v4, v10
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s2, v0, v10
-; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
-; GFX11NONANS-NEXT:    s_or_b32 s1, s2, vcc_lo
-; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test116:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v8, v8, v9
+; GCN-TRUE16-NEXT:    v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v4, v6, v7
+; GCN-TRUE16-NEXT:    v_min3_f32 v0, v0, v1, v8
+; GCN-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v10
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v3, v10
+; GCN-TRUE16-NEXT:    v_cmp_gt_f32_e64 s1, v4, v10
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v10
+; GCN-TRUE16-NEXT:    s_or_b32 s0, s0, s1
+; GCN-TRUE16-NEXT:    s_or_b32 s1, s2, vcc_lo
+; GCN-TRUE16-NEXT:    s_or_b32 s0, s0, s1
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test116:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v8, v8, v9
+; GCN-FAKE16-NEXT:    v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v4, v6, v7
+; GCN-FAKE16-NEXT:    v_min3_f32 v0, v0, v1, v8
+; GCN-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v10
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v3, v10
+; GCN-FAKE16-NEXT:    v_cmp_gt_f32_e64 s1, v4, v10
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v10
+; GCN-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GCN-FAKE16-NEXT:    s_or_b32 s1, s2, vcc_lo
+; GCN-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C
   %cmp2 = fcmp olt float %arg2, %C
   %cmp3 = fcmp ogt float %arg3, %C
@@ -2351,23 +2697,41 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test117:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v6, v6, v7
-; GFX11NONANS-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11
-; GFX11NONANS-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX11NONANS-NEXT:    v_min3_f32 v3, v4, v5, v6
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v12
-; GFX11NONANS-NEXT:    v_min3_f32 v0, v8, v9, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s0, v2, v13
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s1, v3, v13
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e64 s2, v0, v12
-; GFX11NONANS-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GFX11NONANS-NEXT:    s_or_b32 s0, s0, s1
-; GFX11NONANS-NEXT:    s_or_b32 s0, s2, s0
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test117:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v6, v6, v7
+; GCN-TRUE16-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v2, v2, v3
+; GCN-TRUE16-NEXT:    v_min3_f32 v3, v4, v5, v6
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v12
+; GCN-TRUE16-NEXT:    v_min3_f32 v0, v8, v9, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e64 s0, v2, v13
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e64 s1, v3, v13
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v12
+; GCN-TRUE16-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GCN-TRUE16-NEXT:    s_or_b32 s0, s0, s1
+; GCN-TRUE16-NEXT:    s_or_b32 s0, s2, s0
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test117:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v6, v6, v7
+; GCN-FAKE16-NEXT:    v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v2, v2, v3
+; GCN-FAKE16-NEXT:    v_min3_f32 v3, v4, v5, v6
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v12
+; GCN-FAKE16-NEXT:    v_min3_f32 v0, v8, v9, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e64 s0, v2, v13
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e64 s1, v3, v13
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e64 s2, v0, v12
+; GCN-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GCN-FAKE16-NEXT:    s_or_b32 s0, s0, s1
+; GCN-FAKE16-NEXT:    s_or_b32 s0, s2, s0
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %C1
   %cmp2 = fcmp olt float %arg2, %C1
   %cmp3 = fcmp olt float %arg3, %C2
@@ -2635,14 +2999,41 @@ define i1 @test130(i32 %arg1, i32 %arg2, i32 %arg3) {
 }
 
 define i1 @test131(i16 %arg1, i32 %arg2) {
-; GCN-LABEL: test131:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 10, v0
-; GCN-NEXT:    v_cmp_gt_u32_e64 s0, 10, v1
-; GCN-NEXT:    s_or_b32 s0, vcc_lo, s0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test131:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 10, v1
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e64 s0, 10, v0.l
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test131:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 10, v0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u32_e64 s0, 10, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-TRUE16-LABEL: test131:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 10, v1
+; GCN-TRUE16-NEXT:    v_cmp_gt_u16_e64 s0, 10, v0.l
+; GCN-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test131:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 10, v0
+; GCN-FAKE16-NEXT:    v_cmp_gt_u32_e64 s0, 10, v1
+; GCN-FAKE16-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = icmp ult i16 %arg1, 10
   %cmp2 = icmp ult i32 %arg2, 10
   %or = or i1 %cmp1, %cmp2
@@ -2695,13 +3086,21 @@ define i1 @test134(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test134:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test134:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test134:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp ogt float %arg3, %arg2
   %and1  = and i1 %cmp1, %cmp2
@@ -2718,13 +3117,21 @@ define i1 @test135(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test135:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test135:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test135:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult float %arg1, %arg3
   %cmp2 = fcmp ugt float %arg3, %arg2
   %or1  = or i1 %cmp1, %cmp2
@@ -2743,15 +3150,25 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test136:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test136:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-TRUE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test136:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-FAKE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
   %cmp1 = fcmp ole double %var1, %arg3
@@ -2771,13 +3188,21 @@ define i1 @test137(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test137:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test137:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test137:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp ule float %var1, %arg3
@@ -2796,13 +3221,21 @@ define i1 @test138(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test138:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test138:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test138:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp olt float %arg1, %arg3
   %cmp2 = fcmp olt float %arg2, %arg3
   %and1  = and i1 %cmp1, %cmp2
@@ -2819,13 +3252,21 @@ define i1 @test139(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test139:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test139:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test139:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ole double %arg1, %arg3
   %cmp2 = fcmp ole double %arg2, %arg3
   %and1  = and i1 %cmp1, %cmp2
@@ -2842,13 +3283,21 @@ define i1 @test140(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test140:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test140:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test140:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ogt double %arg1, %arg3
   %cmp2 = fcmp ogt double %arg2, %arg3
   %and1  = and i1 %cmp1, %cmp2
@@ -2865,13 +3314,21 @@ define i1 @test141(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test141:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test141:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test141:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp oge float %arg1, %arg3
   %cmp2 = fcmp oge float %arg2, %arg3
   %and1  = and i1 %cmp1, %cmp2
@@ -2888,13 +3345,21 @@ define i1 @test142(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test142:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test142:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test142:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ugt double %arg1, %arg3
   %cmp2 = fcmp ugt double %arg2, %arg3
   %or1  = or i1 %cmp1, %cmp2
@@ -2911,13 +3376,21 @@ define i1 @test143(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test143:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test143:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test143:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp uge float %arg1, %arg3
   %cmp2 = fcmp uge float %arg2, %arg3
   %or1  = or i1 %cmp1, %cmp2
@@ -2934,13 +3407,21 @@ define i1 @test144(float %arg1, float %arg2, float %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test144:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test144:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test144:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ule float %arg1, %arg3
   %cmp2 = fcmp ule float %arg2, %arg3
   %or1  = or i1 %cmp1, %cmp2
@@ -2957,13 +3438,21 @@ define i1 @test145(double %arg1, double %arg2, double %arg3) #0 {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test145:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test145:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test145:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp1 = fcmp ult double %arg1, %arg3
   %cmp2 = fcmp ult double %arg2, %arg3
   %or1 = or i1 %cmp1, %cmp2
@@ -2981,13 +3470,21 @@ define i1 @test146(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test146:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test146:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test146:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp olt float %var1, %arg3
@@ -3008,15 +3505,25 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test147:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test147:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-TRUE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test147:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-FAKE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
   %cmp1 = fcmp ole double %var1, %arg3
@@ -3037,15 +3544,25 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test148:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test148:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-TRUE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-TRUE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test148:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-FAKE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-FAKE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
   %cmp1 = fcmp ogt double %var1, %arg3
@@ -3065,13 +3582,21 @@ define i1 @test149(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test149:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test149:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test149:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp oge float %var1, %arg3
@@ -3092,15 +3617,25 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test150:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test150:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-TRUE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test150:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-FAKE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
   %cmp1 = fcmp ugt double %var1, %arg3
@@ -3120,13 +3655,21 @@ define i1 @test151(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test151:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test151:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test151:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_ge_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp uge float %var1, %arg3
@@ -3146,13 +3689,21 @@ define i1 @test152(float %arg1, float %arg2, float %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test152:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11NONANS-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test152:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-TRUE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test152:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-FAKE16-NEXT:    v_cmp_le_f32_e32 vcc_lo, v0, v2
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call float @llvm.canonicalize.f32(float %arg1)
   %var2 = call float @llvm.canonicalize.f32(float %arg2)
   %cmp1 = fcmp ule float %var1, %arg3
@@ -3173,15 +3724,25 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11NONANS-LABEL: test153:
-; GFX11NONANS:       ; %bb.0:
-; GFX11NONANS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11NONANS-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11NONANS-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11NONANS-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11NONANS-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
-; GFX11NONANS-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11NONANS-NEXT:    s_setpc_b64 s[30:31]
+; GCN-TRUE16-LABEL: test153:
+; GCN-TRUE16:       ; %bb.0:
+; GCN-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-TRUE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-TRUE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-TRUE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: test153:
+; GCN-FAKE16:       ; %bb.0:
+; GCN-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GCN-FAKE16-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GCN-FAKE16-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GCN-FAKE16-NEXT:    v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GCN-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GCN-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %var1 = call double @llvm.canonicalize.f64(double %arg1)
   %var2 = call double @llvm.canonicalize.f64(double %arg2)
   %cmp1 = fcmp ult double %var1, %arg3
@@ -3197,3 +3758,7 @@ declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>)
 
 attributes #0 = { nounwind "amdgpu-ieee"="false" }
 attributes #1 = { nounwind "unsafe-fp-math"="true" "no-nans-fp-math"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11NONANS: {{.*}}
+; GFX11NONANS-FAKE16: {{.*}}
+; GFX11NONANS-TRUE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index b4d450a90d595..00f74f50a4b8b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -4,7 +4,8 @@
 ; RUN: llc < %s -mtriple=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16
 
 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
@@ -1814,18 +1815,31 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7f, v0
-; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_ctlz_i7_sel_eq_neg1:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_clz_i32_u32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0x7f, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_ctlz_i7_sel_eq_neg1:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u8 v0, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_clz_i32_u32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7f, v0
+; GFX11-FAKE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
   %val = load i7, ptr addrspace(1) %valptr.gep

diff  --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 2ec6f7ab7602b..e5d1fd4a0ea85 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
@@ -298,14 +299,23 @@ define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_uitofp_v2i8_to_v2f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_v2i8_to_v2f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte1_e32 v1, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_v2i8_to_v2f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = bitcast i16 %arg0 to <2 x i8>
   %cvt = uitofp <2 x i8> %val to <2 x float>
   ret <2 x float> %cvt
@@ -493,13 +503,21 @@ define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_uitofp_i32_to_f16_mask255:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_i32_to_f16_mask255:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_i32_to_f16_mask255:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = uitofp i32 %masked to half
   ret half %cvt
@@ -535,13 +553,21 @@ define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_sitofp_i32_to_f16_mask255:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_sitofp_i32_to_f16_mask255:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_sitofp_i32_to_f16_mask255:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %masked = and i32 %arg0, 255
   %cvt = sitofp i32 %masked to half
   ret half %cvt
@@ -577,13 +603,21 @@ define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_uitofp_to_f16_lshr8_mask255:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_to_f16_lshr8_mask255:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_to_f16_lshr8_mask255:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %lshr.8 = lshr i32 %arg0, 8
   %masked = and i32 %lshr.8, 255
   %cvt = uitofp i32 %masked to half
@@ -620,13 +654,21 @@ define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_uitofp_to_f16_lshr16_mask255:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_to_f16_lshr16_mask255:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_to_f16_lshr16_mask255:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 16
   %masked = and i32 %lshr.16, 255
   %cvt = uitofp i32 %masked to half
@@ -663,13 +705,21 @@ define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_uitofp_to_f16_lshr24_mask255:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_to_f16_lshr24_mask255:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_to_f16_lshr24_mask255:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %lshr.16 = lshr i32 %arg0, 24
   %masked = and i32 %lshr.16, 255
   %cvt = uitofp i32 %masked to half
@@ -703,13 +753,21 @@ define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
 ; GFX9-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_uitofp_i8_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_i8_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_i8_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to half
   ret half %cvt
 }
@@ -885,14 +943,23 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
 ; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_uitofp_i8_to_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_i8_to_f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_uitofp_i8_to_f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cvt = uitofp i8 %arg0 to double
   ret double %cvt
 }
@@ -1700,45 +1767,82 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_add_nc_u16 v2, v0, 9
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff00, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_nc_u16 v3, v1, 9
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff00, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v2
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x900, v2
-; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add_nc_u16 v1, 0x900, v1
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v2
-; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
-; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    global_store_b32 v4, v5, s[2:3]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: load_v4i8_to_v4f32_2_uses:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, 9
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.h, v4.h, 9
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff00, v4.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff00, v4.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x900, v0.l
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v1.l, 0x900, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
+; GFX11-TRUE16-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v6, v7
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    global_store_b32 v5, v4, s[2:3]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: load_v4i8_to_v4f32_2_uses:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, v0, 9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffffff00, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v3, v1, 9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffffff00, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x900, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v1, 0x900, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    global_store_b32 v4, v5, s[2:3]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
   %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 993f162921663..ce7281702c108 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -5,8 +5,10 @@
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
 
 define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX7-LABEL: fmul_select_f32_test1:
@@ -1594,25 +1596,45 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_f16_test1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_f16_test1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 2.000000e+00, half 1.000000e+00
   %ldexp = fmul half %x, %y
@@ -1682,27 +1704,49 @@ define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_f16_test2:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_f16_test2:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test2:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test2:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test2:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test2:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 5.000000e-01, half 1.000000e+00
   %ldexp = fmul half %x, %y
@@ -1797,39 +1841,69 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_v2f16_test3:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_v2f16_test3:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v2
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2f16_test3:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x4000
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3c00, v2.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3c00, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2f16_test3:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2f16_test3:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2f16_test3:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x half> <half 2.000000e+00, half 2.000000e+00>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
   %ldexp = fmul <2 x half> %x, %y
@@ -1924,39 +1998,69 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_v2f16_test4:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_v2f16_test4:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v2
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2f16_test4:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3c00, v2.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3c00, v2.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2f16_test4:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2f16_test4:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2f16_test4:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x half> <half 5.000000e-01, half 5.000000e-01>, <2 x half> <half 1.000000e+00, half 1.000000e+00>
   %ldexp = fmul <2 x half> %x, %y
@@ -2000,14 +2104,41 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX10-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_f16_test5:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test5:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test5:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test5:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test5:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 2.000000e+00, half 8.000000e+00
   %ldexp = fmul half %x, %y
@@ -2078,25 +2209,45 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_f16_test6:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_f16_test6:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4200
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test6:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x4200, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test6:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test6:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x4200
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, 0xc800, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test6:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v3, 0xc800, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -8.000000e+00, half 3.000000e+00
   %ldexp = fmul half %x, %y
@@ -2166,25 +2317,45 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_f16_test7:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_f16_test7:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc400
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test7:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0xc400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test7:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test7:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0xc400
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.h, 0x4800, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test7:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, v3, 0x4800, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 8.000000e+00, half -4.000000e+00
   %ldexp = fmul half %x, %y
@@ -2232,14 +2403,41 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_f16_test8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test8:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test8:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test8:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0, 0x8000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test8:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -0.000000e+00, half 0.000000e+00
   %ldexp = fmul half %x, %y
@@ -2307,27 +2505,49 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_f16_test9:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_f16_test9:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 5, v1
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test9:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test9:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test9:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test9:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 5, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half -1.600000e+01, half -3.200000e+01
   %ldexp = fmul half %x, %y
@@ -2392,25 +2612,45 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a
 ; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 0xH1000, half 0xH6800
   %ldexp = fmul half %x, %y
@@ -2475,25 +2715,45 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar
 ; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, half 0xH5800, half 0xH0400
   %ldexp = fmul half %x, %y
@@ -2548,25 +2808,87 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2621,25 +2943,87 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test2:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x3f00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test2:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test2:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x3f00, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test2:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2721,37 +3105,135 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_v2bf16_test3:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2bf16_test3:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v2, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, 0x4000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test3:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2bf16_test3:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v2, v4
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, 0x4000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, 0x4000, s0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test3:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 2.000000e+00, bfloat 2.000000e+00>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
@@ -2833,37 +3315,135 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_v2bf16_test4:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX11-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2bf16_test4:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0x3f80
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v2, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, 0x3f00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, 0x3f00, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test4:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2bf16_test4:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0x3f80
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v2, v4
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v5.l, 0x3f00, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v2.l, v5.l, 0x3f00, s0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test4:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
   %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 5.000000e-01, bfloat 5.000000e-01>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00>
   %ldexp = fmul <2 x bfloat> %x, %y
@@ -2919,25 +3499,87 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test5:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test5:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4100
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test5:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test5:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4100
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test5:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -2994,25 +3636,87 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test6:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test6:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4040
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0xc100, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test6:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test6:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4040
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0xc100, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test6:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -3068,25 +3772,87 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test7:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test7:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc080
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x4100, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test7:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test7:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc080
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x4100, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test7:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -3140,24 +3906,81 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffff8000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test8:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test8:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffff8000, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test8:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test8:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 0xffff8000, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00
   %ldexp = fmul bfloat %x, %y
@@ -3214,25 +4037,87 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test9:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test9:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc200
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0xc180, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test9:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test9:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xc200
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0xc180, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test9:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01
   %ldexp = fmul bfloat %x, %y
@@ -3289,25 +4174,87 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xdb80
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0xe000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xdb80
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0xe000, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80
   %ldexp = fmul bfloat %x, %y
@@ -3364,25 +4311,87 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4c00
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x3480, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4c00
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, 0x3480, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
   %y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00
   %ldexp = fmul bfloat %x, %y

diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index ada3f017f45cf..45fe2d07226a1 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
 ; GCN-LABEL: uniform_vec_0_i16:
@@ -341,11 +342,17 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: divergent_vec_i16_LL:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: divergent_vec_i16_LL:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: divergent_vec_i16_LL:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> poison, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -518,11 +525,20 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: divergent_vec_i16_HH:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: divergent_vec_i16_HH:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: divergent_vec_i16_HH:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shift_a = lshr i32 %a, 16
   %tr_a = trunc i32 %shift_a to i16
   %shift_b = lshr i32 %b, 16
@@ -625,11 +641,17 @@ define float @divergent_vec_f16_LL(half %a, half %b) {
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: divergent_vec_f16_LL:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: divergent_vec_f16_LL:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: divergent_vec_f16_LL:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x half> poison, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half %b, i32 1
   %val = bitcast <2 x half> %vec to float

diff  --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 41b61f2e09a3d..fb20e72a77103 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -1,6 +1,6 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index ca2fca69dcf21..2d253c9484309 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
 
 ; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
 
@@ -28,16 +29,27 @@ define amdgpu_kernel void @kernel() #0 {
 }
 
 define amdgpu_cs void @with_local() #0 {
-; CHECK-LABEL: with_local:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT:    v_mov_b32_e32 v0, 13
-; CHECK-NEXT:    s_cmp_lg_u32 0, s33
-; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
-; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    s_alloc_vgpr 0
-; CHECK-NEXT:    s_endpgm
+; CHECK-TRUE16-LABEL: with_local:
+; CHECK-TRUE16:       ; %bb.0:
+; CHECK-TRUE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 13
+; CHECK-TRUE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-TRUE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-TRUE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    s_alloc_vgpr 0
+; CHECK-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: with_local:
+; CHECK-FAKE16:       ; %bb.0:
+; CHECK-FAKE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 13
+; CHECK-FAKE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-FAKE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-FAKE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    s_alloc_vgpr 0
+; CHECK-FAKE16-NEXT:    s_endpgm
   %local = alloca i32, addrspace(5)
   store volatile i8 13, ptr addrspace(5) %local
   ret void
@@ -46,21 +58,37 @@ define amdgpu_cs void @with_local() #0 {
 ; Check that we generate s_cselect for SP if we can fit
 ; the offset in an inline constant.
 define amdgpu_cs void @with_calls_inline_const() #0 {
-; CHECK-LABEL: with_calls_inline_const:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT:    v_mov_b32_e32 v0, 15
-; CHECK-NEXT:    s_cmp_lg_u32 0, s33
-; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
-; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
-; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
-; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
-; CHECK-NEXT:    s_cselect_b32 s32, 0x1d0, 16
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; CHECK-NEXT:    s_alloc_vgpr 0
-; CHECK-NEXT:    s_endpgm
+; CHECK-TRUE16-LABEL: with_calls_inline_const:
+; CHECK-TRUE16:       ; %bb.0:
+; CHECK-TRUE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 15
+; CHECK-TRUE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-TRUE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-TRUE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-TRUE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-TRUE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-TRUE16-NEXT:    s_cselect_b32 s32, 0x1d0, 16
+; CHECK-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-TRUE16-NEXT:    s_alloc_vgpr 0
+; CHECK-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: with_calls_inline_const:
+; CHECK-FAKE16:       ; %bb.0:
+; CHECK-FAKE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-FAKE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-FAKE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-FAKE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-FAKE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-FAKE16-NEXT:    s_cselect_b32 s32, 0x1d0, 16
+; CHECK-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-FAKE16-NEXT:    s_alloc_vgpr 0
+; CHECK-FAKE16-NEXT:    s_endpgm
   %local = alloca i32, addrspace(5)
   store volatile i8 15, ptr addrspace(5) %local
   call amdgpu_gfx void @callee(i32 71)
@@ -70,22 +98,39 @@ define amdgpu_cs void @with_calls_inline_const() #0 {
 ; Check that we generate s_mov + s_cmovk if we can't
 ; fit the offset for SP in an inline constant.
 define amdgpu_cs void @with_calls_no_inline_const() #0 {
-; CHECK-LABEL: with_calls_no_inline_const:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT:    v_mov_b32_e32 v0, 15
-; CHECK-NEXT:    s_cmp_lg_u32 0, s33
-; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
-; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
-; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
-; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
-; CHECK-NEXT:    s_movk_i32 s32, 0x100
-; CHECK-NEXT:    s_cmovk_i32 s32, 0x2c0
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; CHECK-NEXT:    s_alloc_vgpr 0
-; CHECK-NEXT:    s_endpgm
+; CHECK-TRUE16-LABEL: with_calls_no_inline_const:
+; CHECK-TRUE16:       ; %bb.0:
+; CHECK-TRUE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 15
+; CHECK-TRUE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-TRUE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-TRUE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-TRUE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-TRUE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-TRUE16-NEXT:    s_movk_i32 s32, 0x100
+; CHECK-TRUE16-NEXT:    s_cmovk_i32 s32, 0x2c0
+; CHECK-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-TRUE16-NEXT:    s_alloc_vgpr 0
+; CHECK-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: with_calls_no_inline_const:
+; CHECK-FAKE16:       ; %bb.0:
+; CHECK-FAKE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-FAKE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-FAKE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-FAKE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-FAKE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-FAKE16-NEXT:    s_movk_i32 s32, 0x100
+; CHECK-FAKE16-NEXT:    s_cmovk_i32 s32, 0x2c0
+; CHECK-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-FAKE16-NEXT:    s_alloc_vgpr 0
+; CHECK-FAKE16-NEXT:    s_endpgm
   %local = alloca i32, i32 61, addrspace(5)
   store volatile i8 15, ptr addrspace(5) %local
   call amdgpu_gfx void @callee(i32 71)
@@ -135,32 +180,54 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
 }
 
 define amdgpu_cs void @frame_pointer_none() #1 {
-; CHECK-LABEL: frame_pointer_none:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT:    v_mov_b32_e32 v0, 13
-; CHECK-NEXT:    s_cmp_lg_u32 0, s33
-; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
-; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    s_alloc_vgpr 0
-; CHECK-NEXT:    s_endpgm
+; CHECK-TRUE16-LABEL: frame_pointer_none:
+; CHECK-TRUE16:       ; %bb.0:
+; CHECK-TRUE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 13
+; CHECK-TRUE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-TRUE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-TRUE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    s_alloc_vgpr 0
+; CHECK-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: frame_pointer_none:
+; CHECK-FAKE16:       ; %bb.0:
+; CHECK-FAKE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 13
+; CHECK-FAKE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-FAKE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-FAKE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    s_alloc_vgpr 0
+; CHECK-FAKE16-NEXT:    s_endpgm
   %local = alloca i32, addrspace(5)
   store volatile i8 13, ptr addrspace(5) %local
   ret void
 }
 
 define amdgpu_cs void @frame_pointer_all() #2 {
-; CHECK-LABEL: frame_pointer_all:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
-; CHECK-NEXT:    v_mov_b32_e32 v0, 13
-; CHECK-NEXT:    s_cmp_lg_u32 0, s33
-; CHECK-NEXT:    s_cmovk_i32 s33, 0x1c0
-; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    s_alloc_vgpr 0
-; CHECK-NEXT:    s_endpgm
+; CHECK-TRUE16-LABEL: frame_pointer_all:
+; CHECK-TRUE16:       ; %bb.0:
+; CHECK-TRUE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 13
+; CHECK-TRUE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-TRUE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-TRUE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    s_alloc_vgpr 0
+; CHECK-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: frame_pointer_all:
+; CHECK-FAKE16:       ; %bb.0:
+; CHECK-FAKE16-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 13
+; CHECK-FAKE16-NEXT:    s_cmp_lg_u32 0, s33
+; CHECK-FAKE16-NEXT:    s_cmovk_i32 s33, 0x1c0
+; CHECK-FAKE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    s_alloc_vgpr 0
+; CHECK-FAKE16-NEXT:    s_endpgm
   %local = alloca i32, addrspace(5)
   store volatile i8 13, ptr addrspace(5) %local
   ret void
@@ -168,18 +235,31 @@ define amdgpu_cs void @frame_pointer_all() #2 {
 
 ; Non-entry functions and graphics shaders don't need to worry about CWSR.
 define amdgpu_gs void @amdgpu_gs() #0 {
-; CHECK-LABEL: amdgpu_gs:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_mov_b32_e32 v0, 15
-; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
-; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
-; CHECK-NEXT:    s_mov_b32 s32, 16
-; CHECK-NEXT:    scratch_store_b8 off, v0, off scope:SCOPE_SYS
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; CHECK-NEXT:    s_alloc_vgpr 0
-; CHECK-NEXT:    s_endpgm
+; CHECK-TRUE16-LABEL: amdgpu_gs:
+; CHECK-TRUE16:       ; %bb.0:
+; CHECK-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 15
+; CHECK-TRUE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-TRUE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-TRUE16-NEXT:    s_mov_b32 s32, 16
+; CHECK-TRUE16-NEXT:    scratch_store_b8 off, v0, off scope:SCOPE_SYS
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-TRUE16-NEXT:    s_alloc_vgpr 0
+; CHECK-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: amdgpu_gs:
+; CHECK-FAKE16:       ; %bb.0:
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-FAKE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-FAKE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-FAKE16-NEXT:    s_mov_b32 s32, 16
+; CHECK-FAKE16-NEXT:    scratch_store_b8 off, v0, off scope:SCOPE_SYS
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-FAKE16-NEXT:    s_alloc_vgpr 0
+; CHECK-FAKE16-NEXT:    s_endpgm
   %local = alloca i32, addrspace(5)
   store volatile i8 15, ptr addrspace(5) %local
   call amdgpu_gfx void @callee(i32 71)
@@ -187,45 +267,85 @@ define amdgpu_gs void @amdgpu_gs() #0 {
 }
 
 define amdgpu_gfx void @amdgpu_gfx() #0 {
-; CHECK-LABEL: amdgpu_gfx:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT:    s_wait_expcnt 0x0
-; CHECK-NEXT:    s_wait_samplecnt 0x0
-; CHECK-NEXT:    s_wait_bvhcnt 0x0
-; CHECK-NEXT:    s_wait_kmcnt 0x0
-; CHECK-NEXT:    s_mov_b32 s0, s33
-; CHECK-NEXT:    s_mov_b32 s33, s32
-; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
-; CHECK-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_wait_alu 0xfffe
-; CHECK-NEXT:    s_mov_b32 exec_lo, s1
-; CHECK-NEXT:    v_writelane_b32 v40, s0, 2
-; CHECK-NEXT:    v_mov_b32_e32 v0, 15
-; CHECK-NEXT:    s_mov_b32 s1, callee@abs32@hi
-; CHECK-NEXT:    s_mov_b32 s0, callee@abs32@lo
-; CHECK-NEXT:    s_add_co_i32 s32, s32, 16
-; CHECK-NEXT:    v_writelane_b32 v40, s30, 0
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
-; CHECK-NEXT:    s_wait_storecnt 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0x47
-; CHECK-NEXT:    v_writelane_b32 v40, s31, 1
-; CHECK-NEXT:    s_wait_alu 0xfffe
-; CHECK-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_readlane_b32 s31, v40, 1
-; CHECK-NEXT:    v_readlane_b32 s30, v40, 0
-; CHECK-NEXT:    s_mov_b32 s32, s33
-; CHECK-NEXT:    v_readlane_b32 s0, v40, 2
-; CHECK-NEXT:    s_or_saveexec_b32 s1, -1
-; CHECK-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    s_wait_alu 0xfffe
-; CHECK-NEXT:    s_mov_b32 exec_lo, s1
-; CHECK-NEXT:    s_mov_b32 s33, s0
-; CHECK-NEXT:    s_wait_loadcnt 0x0
-; CHECK-NEXT:    s_wait_alu 0xfffe
-; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-TRUE16-LABEL: amdgpu_gfx:
+; CHECK-TRUE16:       ; %bb.0:
+; CHECK-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-TRUE16-NEXT:    s_wait_expcnt 0x0
+; CHECK-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; CHECK-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; CHECK-TRUE16-NEXT:    s_mov_b32 s0, s33
+; CHECK-TRUE16-NEXT:    s_mov_b32 s33, s32
+; CHECK-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; CHECK-TRUE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; CHECK-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 15
+; CHECK-TRUE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-TRUE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-TRUE16-NEXT:    s_add_co_i32 s32, s32, 16
+; CHECK-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-TRUE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-TRUE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-TRUE16-NEXT:    s_mov_b32 s32, s33
+; CHECK-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; CHECK-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; CHECK-TRUE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-TRUE16-NEXT:    s_mov_b32 s33, s0
+; CHECK-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; CHECK-TRUE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; CHECK-FAKE16-LABEL: amdgpu_gfx:
+; CHECK-FAKE16:       ; %bb.0:
+; CHECK-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-FAKE16-NEXT:    s_wait_expcnt 0x0
+; CHECK-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; CHECK-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s0, s33
+; CHECK-FAKE16-NEXT:    s_mov_b32 s33, s32
+; CHECK-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; CHECK-FAKE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 15
+; CHECK-FAKE16-NEXT:    s_mov_b32 s1, callee@abs32@hi
+; CHECK-FAKE16-NEXT:    s_mov_b32 s0, callee@abs32@lo
+; CHECK-FAKE16-NEXT:    s_add_co_i32 s32, s32, 16
+; CHECK-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-FAKE16-NEXT:    s_wait_storecnt 0x0
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x47
+; CHECK-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; CHECK-FAKE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; CHECK-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s32, s33
+; CHECK-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; CHECK-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; CHECK-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; CHECK-FAKE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; CHECK-FAKE16-NEXT:    s_mov_b32 s33, s0
+; CHECK-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; CHECK-FAKE16-NEXT:    s_wait_alu 0xfffe
+; CHECK-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %local = alloca i32, addrspace(5)
   store volatile i8 15, ptr addrspace(5) %local
   call amdgpu_gfx void @callee(i32 71)

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index af7028173f6c7..dbbe43152e0df 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i32 inreg %cond.arg) {
 ; SI-LABEL: vec_8xi16_extract_4xi16:
@@ -116,38 +117,66 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:    s_branch .LBB0_2
 ;
-; GFX11-LABEL: vec_8xi16_extract_4xi16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB0_4
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB0_3
-; GFX11-NEXT:  .LBB0_2: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB0_3: ; %exit
-; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB0_4:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT:    s_branch .LBB0_2
+; GFX11-TRUE16-LABEL: vec_8xi16_extract_4xi16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB0_4
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB0_3
+; GFX11-TRUE16-NEXT:  .LBB0_2: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB0_3: ; %exit
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB0_4:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-TRUE16-NEXT:    s_branch .LBB0_2
+;
+; GFX11-FAKE16-LABEL: vec_8xi16_extract_4xi16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB0_4
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB0_3
+; GFX11-FAKE16-NEXT:  .LBB0_2: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB0_3: ; %exit
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB0_4:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-FAKE16-NEXT:    s_branch .LBB0_2
   %cond = icmp eq i32 %cond.arg, 0
   br i1 %cond, label %T, label %F
 
@@ -282,38 +311,66 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:    s_branch .LBB1_2
 ;
-; GFX11-LABEL: vec_8xi16_extract_4xi16_2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB1_4
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB1_3
-; GFX11-NEXT:  .LBB1_2: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB1_3: ; %exit
-; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB1_4:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT:    s_branch .LBB1_2
+; GFX11-TRUE16-LABEL: vec_8xi16_extract_4xi16_2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB1_4
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB1_3
+; GFX11-TRUE16-NEXT:  .LBB1_2: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB1_3: ; %exit
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB1_4:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-TRUE16-NEXT:    s_branch .LBB1_2
+;
+; GFX11-FAKE16-LABEL: vec_8xi16_extract_4xi16_2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB1_4
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB1_3
+; GFX11-FAKE16-NEXT:  .LBB1_2: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB1_3: ; %exit
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB1_4:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-FAKE16-NEXT:    s_branch .LBB1_2
   %cond = icmp eq i32 %cond.arg, 0
   br i1 %cond, label %T, label %F
 
@@ -451,39 +508,73 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:    s_branch .LBB2_2
 ;
-; GFX11-LABEL: vec_8xf16_extract_4xf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB2_4
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB2_3
-; GFX11-NEXT:  .LBB2_2: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB2_3: ; %exit
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
-; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB2_4:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
-; GFX11-NEXT:    s_branch .LBB2_2
+; GFX11-TRUE16-LABEL: vec_8xf16_extract_4xf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB2_4
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB2_3
+; GFX11-TRUE16-NEXT:  .LBB2_2: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB2_3: ; %exit
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3d00
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3900, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, 0x3900, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, 0x3900, v0.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v2.l, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB2_4:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-TRUE16-NEXT:    s_branch .LBB2_2
+;
+; GFX11-FAKE16-LABEL: vec_8xf16_extract_4xf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB2_4
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB2_3
+; GFX11-FAKE16-NEXT:  .LBB2_2: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB2_3: ; %exit
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3d00
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
+; GFX11-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v2, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB2_4:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-FAKE16-NEXT:    s_branch .LBB2_2
   %cond = icmp eq i32 %cond.arg, 0
   br i1 %cond, label %T, label %F
 
@@ -656,42 +747,74 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:    s_branch .LBB3_2
 ;
-; GFX11-LABEL: vec_16xi16_extract_4xi16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB3_4
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB3_3
-; GFX11-NEXT:  .LBB3_2: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB3_3: ; %exit
-; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB3_4:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:    s_branch .LBB3_2
+; GFX11-TRUE16-LABEL: vec_16xi16_extract_4xi16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB3_4
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB3_3
+; GFX11-TRUE16-NEXT:  .LBB3_2: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB3_3: ; %exit
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB3_4:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-TRUE16-NEXT:    s_branch .LBB3_2
+;
+; GFX11-FAKE16-LABEL: vec_16xi16_extract_4xi16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB3_4
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB3_3
+; GFX11-FAKE16-NEXT:  .LBB3_2: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB3_3: ; %exit
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB3_4:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-FAKE16-NEXT:    s_branch .LBB3_2
   %cond = icmp eq i32 %cond.arg, 0
   br i1 %cond, label %T, label %F
 
@@ -866,42 +989,74 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:    s_branch .LBB4_2
 ;
-; GFX11-LABEL: vec_16xi16_extract_4xi16_2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB4_4
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB4_3
-; GFX11-NEXT:  .LBB4_2: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB4_3: ; %exit
-; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB4_4:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:    s_branch .LBB4_2
+; GFX11-TRUE16-LABEL: vec_16xi16_extract_4xi16_2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB4_4
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX11-TRUE16-NEXT:  .LBB4_2: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB4_3: ; %exit
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.h, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.h, 0x8000, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB4_4:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-TRUE16-NEXT:    s_branch .LBB4_2
+;
+; GFX11-FAKE16-LABEL: vec_16xi16_extract_4xi16_2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB4_4
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB4_3
+; GFX11-FAKE16-NEXT:  .LBB4_2: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB4_3: ; %exit
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB4_4:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-FAKE16-NEXT:    s_branch .LBB4_2
   %cond = icmp eq i32 %cond.arg, 0
   br i1 %cond, label %T, label %F
 
@@ -1079,43 +1234,81 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:    s_branch .LBB5_2
 ;
-; GFX11-LABEL: vec_16xf16_extract_4xf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB5_4
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB5_3
-; GFX11-NEXT:  .LBB5_2: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB5_3: ; %exit
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
-; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-; GFX11-NEXT:  .LBB5_4:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:    s_branch .LBB5_2
+; GFX11-TRUE16-LABEL: vec_16xf16_extract_4xf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB5_4
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB5_3
+; GFX11-TRUE16-NEXT:  .LBB5_2: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB5_3: ; %exit
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3d00
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3900, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, 0x3900, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, 0x3900, v0.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v2.l, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-NEXT:  .LBB5_4:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-TRUE16-NEXT:    s_branch .LBB5_2
+;
+; GFX11-FAKE16-LABEL: vec_16xf16_extract_4xf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB5_4
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB5_3
+; GFX11-FAKE16-NEXT:  .LBB5_2: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB5_3: ; %exit
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3d00
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
+; GFX11-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v2, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-NEXT:  .LBB5_4:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-FAKE16-NEXT:    s_branch .LBB5_2
   %cond = icmp eq i32 %cond.arg, 0
   br i1 %cond, label %T, label %F
 
@@ -1426,62 +1619,108 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; GFX9-NEXT:    v_perm_b32 v3, v7, v3, s34
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: vec_16xi16_extract_8xi16_0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_u8 v4, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT:    s_cbranch_scc0 .LBB7_2
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB7_3
-; GFX11-NEXT:    s_branch .LBB7_4
-; GFX11-NEXT:  .LBB7_2:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:  .LBB7_3: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB7_4: ; %exit
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8
-; GFX11-NEXT:    v_perm_b32 v2, v7, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: vec_16xi16_extract_8xi16_0:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v4, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB7_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB7_3
+; GFX11-TRUE16-NEXT:    s_branch .LBB7_4
+; GFX11-TRUE16-NEXT:  .LBB7_2:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
+; GFX11-TRUE16-NEXT:  .LBB7_3: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB7_4: ; %exit
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0x3900
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e64 s0, 0x3801, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_u16_e64 s1, 0x3800, v7.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e64 s2, 0x3801, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e64 s3, 0x3801, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v3.h, 0x3d00, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e64 s0, 0x3801, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_u16_e64 s34, 0x3801, v6.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, 0x3d00, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, 0x3d00, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.h, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, 0x3d00, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, 0x3d00, s34
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, 0x3d00, v3.h, s1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: vec_16xi16_extract_8xi16_0:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v4, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB7_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB7_3
+; GFX11-FAKE16-NEXT:    s_branch .LBB7_4
+; GFX11-FAKE16-NEXT:  .LBB7_2:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-FAKE16-NEXT:  .LBB7_3: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB7_4: ; %exit
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, 0x3900
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0x3d00
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v7, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   br i1 %cond, label %T, label %F
 
 T:
@@ -1717,62 +1956,114 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX9-NEXT:    v_pack_b32_f16 v3, v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: vec_16xf16_extract_8xf16_0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_u8 v4, off, s32
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
-; GFX11-NEXT:    s_cbranch_scc0 .LBB8_2
-; GFX11-NEXT:  ; %bb.1: ; %F
-; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB8_3
-; GFX11-NEXT:    s_branch .LBB8_4
-; GFX11-NEXT:  .LBB8_2:
-; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-NEXT:  .LBB8_3: ; %T
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:  .LBB8_4: ; %exit
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v8
-; GFX11-NEXT:    v_pack_b32_f16 v2, v4, v7
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v3, v5, v6
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: vec_16xf16_extract_8xf16_0:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v4, off, s32
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB8_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB8_3
+; GFX11-TRUE16-NEXT:    s_branch .LBB8_4
+; GFX11-TRUE16-NEXT:  .LBB8_2:
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-TRUE16-NEXT:  .LBB8_3: ; %T
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:  .LBB8_4: ; %exit
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3d00
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v4.l
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v5.h
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s3, 0.5, v3.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, 0x3900, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, 0x3900, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.l
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s34, 0.5, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3900, v0.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, 0x3900, v0.l, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0x3900, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3900, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, 0x3900, v0.l, s34
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v0.l, 0x3900, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v5.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v4.l, v3.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: vec_16xf16_extract_8xf16_0:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v4, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, vcc_lo, exec_lo
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB8_2
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %F
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB8_3
+; GFX11-FAKE16-NEXT:    s_branch .LBB8_4
+; GFX11-FAKE16-NEXT:  .LBB8_2:
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-FAKE16-NEXT:  .LBB8_3: ; %T
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:  .LBB8_4: ; %exit
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, 0x3900
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0x3d00
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v8
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v2, v4, v7
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v3, v5, v6
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   br i1 %cond, label %T, label %F
 
 T:

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 9919497acea73..bc541043f1fab 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.canonicalize.f32(float) #0
@@ -2401,31 +2403,57 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_canonicalize_value_f16_flush:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: test_canonicalize_value_f16_flush:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[0:1]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_canonicalize_value_f16_flush:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_canonicalize_value_f16_flush:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: test_canonicalize_value_f16_flush:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_canonicalize_value_f16_flush:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
   %v = load half, ptr addrspace(1) %gep, align 2
@@ -2757,31 +2785,57 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_canonicalize_value_f16_denorm:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: test_canonicalize_value_f16_denorm:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[0:1]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_canonicalize_value_f16_denorm:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_canonicalize_value_f16_denorm:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: test_canonicalize_value_f16_denorm:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_canonicalize_value_f16_denorm:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
   %v = load half, ptr addrspace(1) %gep, align 2

diff  --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 3156a1280afea..1d83d33a4f832 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -3,7 +3,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_kernel void @fcmp_f16_lt(
 ; SI-LABEL: fcmp_f16_lt:
@@ -106,30 +107,55 @@ define amdgpu_kernel void @fcmp_f16_lt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_lt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_lt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_lt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -245,33 +271,61 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_lt_abs:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_lt_abs:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_lt_abs:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -387,30 +441,55 @@ define amdgpu_kernel void @fcmp_f16_eq(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_eq:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_eq:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_eq:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -524,30 +603,55 @@ define amdgpu_kernel void @fcmp_f16_le(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_le:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_le:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_le:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -661,30 +765,55 @@ define amdgpu_kernel void @fcmp_f16_gt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_gt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_gt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_gt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -798,30 +927,55 @@ define amdgpu_kernel void @fcmp_f16_lg(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_lg:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_lg:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_lg:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -935,30 +1089,55 @@ define amdgpu_kernel void @fcmp_f16_ge(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_ge:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_ge:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_ge:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -1072,30 +1251,55 @@ define amdgpu_kernel void @fcmp_f16_o(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_o:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_o:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_o:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -1209,30 +1413,55 @@ define amdgpu_kernel void @fcmp_f16_u(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_u:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_u:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_u:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -1346,30 +1575,55 @@ define amdgpu_kernel void @fcmp_f16_nge(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_nge:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_nge:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_nge:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -1483,30 +1737,55 @@ define amdgpu_kernel void @fcmp_f16_nlg(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_nlg:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_nlg:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_nlg:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -1620,30 +1899,55 @@ define amdgpu_kernel void @fcmp_f16_ngt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_ngt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_ngt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_ngt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -1757,30 +2061,55 @@ define amdgpu_kernel void @fcmp_f16_nle(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_nle:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_nle:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_nle:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -1894,30 +2223,55 @@ define amdgpu_kernel void @fcmp_f16_neq(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_neq:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_neq:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_neq:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -2031,30 +2385,55 @@ define amdgpu_kernel void @fcmp_f16_nlt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_f16_nlt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_f16_nlt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_f16_nlt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -2188,36 +2567,67 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_lt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_lt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_lt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -2352,36 +2762,67 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_eq:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_eq:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_eq:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -2515,36 +2956,67 @@ define amdgpu_kernel void @fcmp_v2f16_le(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_le:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_le:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_le:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -2678,36 +3150,67 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_gt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_gt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_gt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -2842,36 +3345,67 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_lg:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_lg:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_lg:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -3006,36 +3540,67 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_ge:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_ge:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_ge:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -3170,36 +3735,67 @@ define amdgpu_kernel void @fcmp_v2f16_o(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_o:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_o:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_o:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -3334,36 +3930,67 @@ define amdgpu_kernel void @fcmp_v2f16_u(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_u:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_u_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_u_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_u:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_u:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -3497,36 +4124,67 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_nge:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_nge:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_nge:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -3660,36 +4318,67 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_nlg:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_nlg:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_nlg:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -3824,36 +4513,67 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_ngt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_ngt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_ngt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -3987,36 +4707,67 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_nle:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_nle:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_nle:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -4150,36 +4901,67 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_neq:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_neq:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_neq:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -4313,36 +5095,67 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
 ; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: fcmp_v2f16_nlt:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s6, s10
-; GFX12-NEXT:    s_mov_b32 s7, s11
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    buffer_load_b32 v0, off, s[4:7], null
-; GFX12-NEXT:    buffer_load_b32 v1, off, s[12:15], null
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v0
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX12-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: fcmp_v2f16_nlt:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: fcmp_v2f16_nlt:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v1, off, s[12:15], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v0
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX12-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b) {
@@ -4359,3 +5172,5 @@ declare half @llvm.fabs.f16(half) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index fbcdbed338e60..86ebf3fc3e9de 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmax3_olt_0_f32:
@@ -423,65 +425,125 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fmax3_olt_0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
-; GFX11-NEXT:    v_max3_f16 v0, v0, v1, v2
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_fmax3_olt_0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: test_fmax3_olt_0_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s18, s10
-; GFX12-NEXT:    s_mov_b32 s19, s11
-; GFX12-NEXT:    s_mov_b32 s22, s10
-; GFX12-NEXT:    s_mov_b32 s23, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    s_mov_b32 s16, s4
-; GFX12-NEXT:    s_mov_b32 s17, s5
-; GFX12-NEXT:    s_mov_b32 s20, s6
-; GFX12-NEXT:    s_mov_b32 s21, s7
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_max3_num_f16 v0, v0, v1, v2
-; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX11-FAKE16-LABEL: test_fmax3_olt_0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT:    v_max3_f16 v0, v0, v1, v2
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: test_fmax3_olt_0_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_fmax3_olt_0_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v1, v2
+; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -591,65 +653,125 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fmax3_olt_1_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
-; GFX11-NEXT:    v_max3_f16 v0, v2, v0, v1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_fmax3_olt_1_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: test_fmax3_olt_1_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s18, s10
-; GFX12-NEXT:    s_mov_b32 s19, s11
-; GFX12-NEXT:    s_mov_b32 s22, s10
-; GFX12-NEXT:    s_mov_b32 s23, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    s_mov_b32 s16, s4
-; GFX12-NEXT:    s_mov_b32 s17, s5
-; GFX12-NEXT:    s_mov_b32 s20, s6
-; GFX12-NEXT:    s_mov_b32 s21, s7
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_max3_num_f16 v0, v2, v0, v1
-; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX11-FAKE16-LABEL: test_fmax3_olt_1_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT:    v_max3_f16 v0, v2, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: test_fmax3_olt_1_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_fmax3_olt_1_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_max3_num_f16 v0, v2, v0, v1
+; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2

diff  --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index fe8150b3c21c4..0adbecd952ae3 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
 ; GCN-LABEL: test_fmaximum_f32_vv:
@@ -110,10 +112,25 @@ define amdgpu_ps <16 x float> @test_fmaximum_v16f32(<16 x float> %a, <16 x float
 }
 
 define amdgpu_ps half @test_fmaximum_f16_vv(half %a, half %b) {
-; GCN-LABEL: test_fmaximum_f16_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_maximum_f16 v0, v0, v1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-TRUE16-LABEL: test_fmaximum_f16_vv:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: test_fmaximum_f16_vv:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: test_fmaximum_f16_vv:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fmaximum_f16_vv:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call half @llvm.maximum.f16(half %a, half %b)
   ret half %val
 }
@@ -154,11 +171,17 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b
 ; GFX12-SDAG-NEXT:    v_pk_maximum_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: test_fmaximum_v3f16_vv:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    v_pk_maximum_f16 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_maximum_f16 v1, v1, v3
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    v_maximum_f16 v1.l, v1.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fmaximum_v3f16_vv:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_maximum_f16 v1, v1, v3
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
@@ -282,20 +305,65 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fmaximum_f16_move_to_valu:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_clause 0x1
-; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT:    s_wait_loadcnt 0x0
-; GCN-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT:    s_wait_loadcnt 0x0
-; GCN-NEXT:    v_maximum_f16 v1, v1, v2
-; GCN-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GCN-NEXT:    s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: fmaximum_f16_move_to_valu:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_clause 0x1
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: fmaximum_f16_move_to_valu:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_clause 0x1
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v1, v1, v2
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: fmaximum_f16_move_to_valu:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5] scope:SCOPE_SYS
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-FAKE16-LABEL: fmaximum_f16_move_to_valu:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_maximum_f16 v1, v1, v2
+; GFX12-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 4
   %b = load volatile half, ptr addrspace(1) %bptr, align 4
   %v = call half @llvm.maximum.f16(half %a, half %b)

diff  --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 2a372dffce650..1b8a79ee982d1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
 
@@ -1242,15 +1243,25 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 
 
 define half @v_fmaximum3_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16:
 ; GFX942:       ; %bb.0:
@@ -1277,15 +1288,25 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_commute:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v2, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_commute:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v2.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_commute:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v2, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_commute:
 ; GFX942:       ; %bb.0:
@@ -1312,16 +1333,27 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
 }
 
 define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %c) {
-; GFX12-LABEL: s_fmaximum3_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_maximum3_f16 v0, s0, s1, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX12-NEXT:    s_wait_alu 0xf1ff
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-TRUE16-LABEL: s_fmaximum3_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, s0, s1, v0.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: s_fmaximum3_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, s0, s1, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX942-LABEL: s_fmaximum3_f16:
 ; GFX942:       ; %bb.0:
@@ -1359,15 +1391,25 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
 }
 
 define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fabs0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fabs0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, |v0.l|, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fabs0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, |v0|, v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fabs0:
 ; GFX942:       ; %bb.0:
@@ -1396,15 +1438,25 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fabs1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, |v1|, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fabs1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, |v1.l|, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fabs1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, |v1|, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fabs1:
 ; GFX942:       ; %bb.0:
@@ -1433,15 +1485,25 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fabs2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, |v2|
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fabs2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, v1.l, |v2.l|
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fabs2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, v1, |v2|
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fabs2:
 ; GFX942:       ; %bb.0:
@@ -1470,15 +1532,25 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fabs_all:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, |v0|, |v1|, |v2|
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, |v0.l|, |v1.l|, |v2.l|
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fabs_all:
 ; GFX942:       ; %bb.0:
@@ -1511,15 +1583,25 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fneg_all:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, -v0, -v1, -v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, -v0.l, -v1.l, -v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, -v0, -v1, -v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fneg_all:
 ; GFX942:       ; %bb.0:
@@ -1552,15 +1634,25 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fneg_fabs_all:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, -|v0.l|, -|v1.l|, -|v2.l|
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fneg_fabs_all:
 ; GFX942:       ; %bb.0:
@@ -1596,15 +1688,25 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fneg0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, -v0, v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fneg0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, -v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fneg0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, -v0, v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fneg0:
 ; GFX942:       ; %bb.0:
@@ -1633,15 +1735,25 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fneg1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, -v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fneg1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fneg1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, -v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fneg1:
 ; GFX942:       ; %bb.0:
@@ -1670,15 +1782,25 @@ define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_fneg2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, -v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_fneg2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_fneg2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, v1, -v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_fneg2:
 ; GFX942:       ; %bb.0:
@@ -1707,15 +1829,25 @@ define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16_const0(half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_const0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, 0x4800, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_const0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, 0x4800, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_const0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, 0x4800, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_const0:
 ; GFX942:       ; %bb.0:
@@ -1743,15 +1875,25 @@ define half @v_fmaximum3_f16_const0(half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16__const2(half %a, half %b) {
-; GFX12-LABEL: v_fmaximum3_f16__const2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 0x4800
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16__const2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, v1.l, 0x4800
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16__const2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, v1, 0x4800
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16__const2:
 ; GFX942:       ; %bb.0:
@@ -1779,15 +1921,25 @@ define half @v_fmaximum3_f16__const2(half %a, half %b) {
 }
 
 define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
-; GFX12-LABEL: v_fmaximum3_f16_inlineimm0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, 4.0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, 4.0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_inlineimm0:
 ; GFX942:       ; %bb.0:
@@ -1814,15 +1966,25 @@ define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
 }
 
 define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
-; GFX12-LABEL: v_fmaximum3_f16__inlineimm:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, v1, 4.0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, v1.l, 4.0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, v1, 4.0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16__inlineimm:
 ; GFX942:       ; %bb.0:
@@ -1849,17 +2011,29 @@ define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
 }
 
 define half @v_fmaximum3_f16_const1_const2(half %a) {
-; GFX12-LABEL: v_fmaximum3_f16_const1_const2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_movk_i32 s0, 0x4800
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_maximum3_f16 v0, v0, s0, 0x4c00
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x4c00
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_maximum3_f16 v0.l, v0.l, 0x4800, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_movk_i32 s0, 0x4800
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_maximum3_f16 v0, v0, s0, 0x4c00
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fmaximum3_f16_const1_const2:
 ; GFX942:       ; %bb.0:
@@ -3700,18 +3874,31 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f32__multi_use(float inreg %a, float
 }
 
 define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
-; GFX12-LABEL: v_no_fmaximum3_f16__multi_use:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_maximum_f16 v1, v0, v2
-; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_maximum_f16 v0.h, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_no_fmaximum3_f16__multi_use:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_maximum_f16 v1, v0, v2
+; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_no_fmaximum3_f16__multi_use:
 ; GFX942:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 269fd52df5c49..d554707027bb1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 {
 ; SI-LABEL: test_fmin3_olt_0_f32:
@@ -423,65 +425,125 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fmin3_olt_0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
-; GFX11-NEXT:    v_min3_f16 v0, v0, v1, v2
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_fmin3_olt_0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: test_fmin3_olt_0_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s18, s10
-; GFX12-NEXT:    s_mov_b32 s19, s11
-; GFX12-NEXT:    s_mov_b32 s22, s10
-; GFX12-NEXT:    s_mov_b32 s23, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    s_mov_b32 s16, s4
-; GFX12-NEXT:    s_mov_b32 s17, s5
-; GFX12-NEXT:    s_mov_b32 s20, s6
-; GFX12-NEXT:    s_mov_b32 s21, s7
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_min3_num_f16 v0, v0, v1, v2
-; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX11-FAKE16-LABEL: test_fmin3_olt_0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT:    v_min3_f16 v0, v0, v1, v2
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: test_fmin3_olt_0_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_fmin3_olt_0_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v1, v2
+; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2
@@ -591,65 +653,125 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fmin3_olt_1_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
-; GFX11-NEXT:    v_min3_f16 v0, v2, v0, v1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_fmin3_olt_1_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: test_fmin3_olt_1_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX12-NEXT:    s_mov_b32 s10, -1
-; GFX12-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX12-NEXT:    s_mov_b32 s14, s10
-; GFX12-NEXT:    s_mov_b32 s15, s11
-; GFX12-NEXT:    s_mov_b32 s18, s10
-; GFX12-NEXT:    s_mov_b32 s19, s11
-; GFX12-NEXT:    s_mov_b32 s22, s10
-; GFX12-NEXT:    s_mov_b32 s23, s11
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s12, s2
-; GFX12-NEXT:    s_mov_b32 s13, s3
-; GFX12-NEXT:    s_mov_b32 s16, s4
-; GFX12-NEXT:    s_mov_b32 s17, s5
-; GFX12-NEXT:    s_mov_b32 s20, s6
-; GFX12-NEXT:    s_mov_b32 s21, s7
-; GFX12-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s8, s0
-; GFX12-NEXT:    s_mov_b32 s9, s1
-; GFX12-NEXT:    v_min3_num_f16 v0, v2, v0, v1
-; GFX12-NEXT:    buffer_store_b16 v0, off, s[8:11], null
-; GFX12-NEXT:    s_endpgm
+; GFX11-FAKE16-LABEL: test_fmin3_olt_1_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT:    v_min3_f16 v0, v2, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: test_fmin3_olt_1_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-TRUE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-TRUE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_fmin3_olt_1_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX12-FAKE16-NEXT:    s_mov_b32 s22, s10
+; GFX12-FAKE16-NEXT:    s_mov_b32 s23, s11
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX12-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX12-FAKE16-NEXT:    s_mov_b32 s20, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s21, s7
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX12-FAKE16-NEXT:    v_min3_num_f16 v0, v2, v0, v1
+; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 2
   %b = load volatile half, ptr addrspace(1) %bptr, align 2
   %c = load volatile half, ptr addrspace(1) %cptr, align 2

diff  --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index ba536aade8c49..e1d35b52defee 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
 ; GCN-LABEL: test_fminimum_f32_vv:
@@ -110,10 +112,25 @@ define amdgpu_ps <16 x float> @test_fminimum_v16f32(<16 x float> %a, <16 x float
 }
 
 define amdgpu_ps half @test_fminimum_f16_vv(half %a, half %b) {
-; GCN-LABEL: test_fminimum_f16_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_minimum_f16 v0, v0, v1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-TRUE16-LABEL: test_fminimum_f16_vv:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: test_fminimum_f16_vv:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: test_fminimum_f16_vv:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fminimum_f16_vv:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call half @llvm.minimum.f16(half %a, half %b)
   ret half %val
 }
@@ -154,11 +171,17 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b
 ; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: test_fminimum_v3f16_vv:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    v_pk_minimum_f16 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v1, v3
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: test_fminimum_v3f16_vv:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.l, v1.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fminimum_v3f16_vv:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v1, v3
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %val = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
   ret <3 x half> %val
 }
@@ -282,20 +305,65 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
 }
 
 define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
-; GCN-LABEL: fminimum_f16_move_to_valu:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_clause 0x1
-; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GCN-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    s_wait_kmcnt 0x0
-; GCN-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
-; GCN-NEXT:    s_wait_loadcnt 0x0
-; GCN-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
-; GCN-NEXT:    s_wait_loadcnt 0x0
-; GCN-NEXT:    v_minimum_f16 v1, v1, v2
-; GCN-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GCN-NEXT:    s_endpgm
+; GFX12-SDAG-TRUE16-LABEL: fminimum_f16_move_to_valu:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_clause 0x1
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: fminimum_f16_move_to_valu:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_clause 0x1
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v1, v1, v2
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-TRUE16-LABEL: fminimum_f16_move_to_valu:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5] scope:SCOPE_SYS
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-GISEL-FAKE16-LABEL: fminimum_f16_move_to_valu:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-GISEL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] scope:SCOPE_SYS
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v1, v2
+; GFX12-GISEL-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-GISEL-FAKE16-NEXT:    s_endpgm
   %a = load volatile half, ptr addrspace(1) %aptr, align 4
   %b = load volatile half, ptr addrspace(1) %bptr, align 4
   %v = call half @llvm.minimum.f16(half %a, half %b)

diff  --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 34d7e5acb7896..96e9aa375f5ee 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
 
@@ -1242,15 +1243,25 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 
 
 define half @v_fminimum3_f16(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16:
 ; GFX942:       ; %bb.0:
@@ -1277,15 +1288,25 @@ define half @v_fminimum3_f16(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_commute:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v2, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_commute:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v2.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_commute:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v2, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_commute:
 ; GFX942:       ; %bb.0:
@@ -1312,16 +1333,27 @@ define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
 }
 
 define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %c) {
-; GFX12-LABEL: s_fminimum3_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_minimum3_f16 v0, s0, s1, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX12-NEXT:    s_wait_alu 0xf1ff
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-TRUE16-LABEL: s_fminimum3_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, s0, s1, v0.l
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: s_fminimum3_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, s0, s1, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX942-LABEL: s_fminimum3_f16:
 ; GFX942:       ; %bb.0:
@@ -1359,15 +1391,25 @@ define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %
 }
 
 define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fabs0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fabs0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, |v0.l|, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fabs0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, |v0|, v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fabs0:
 ; GFX942:       ; %bb.0:
@@ -1396,15 +1438,25 @@ define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fabs1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, |v1|, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fabs1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, |v1.l|, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fabs1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, |v1|, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fabs1:
 ; GFX942:       ; %bb.0:
@@ -1433,15 +1485,25 @@ define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fabs2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, |v2|
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fabs2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, v1.l, |v2.l|
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fabs2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, v1, |v2|
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fabs2:
 ; GFX942:       ; %bb.0:
@@ -1470,15 +1532,25 @@ define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fabs_all:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, |v0|, |v1|, |v2|
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fabs_all:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, |v0.l|, |v1.l|, |v2.l|
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fabs_all:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fabs_all:
 ; GFX942:       ; %bb.0:
@@ -1511,15 +1583,25 @@ define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fneg_all:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, -v0, -v1, -v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fneg_all:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, -v0.l, -v1.l, -v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fneg_all:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, -v0, -v1, -v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fneg_all:
 ; GFX942:       ; %bb.0:
@@ -1552,15 +1634,25 @@ define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fneg_fabs_all:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, -|v0.l|, -|v1.l|, -|v2.l|
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fneg_fabs_all:
 ; GFX942:       ; %bb.0:
@@ -1596,15 +1688,25 @@ define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fneg0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, -v0, v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fneg0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, -v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fneg0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, -v0, v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fneg0:
 ; GFX942:       ; %bb.0:
@@ -1633,15 +1735,25 @@ define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fneg1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, -v1, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fneg1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fneg1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, -v1, v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fneg1:
 ; GFX942:       ; %bb.0:
@@ -1670,15 +1782,25 @@ define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_fneg2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, -v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_fneg2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_fneg2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, v1, -v2
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_fneg2:
 ; GFX942:       ; %bb.0:
@@ -1707,15 +1829,25 @@ define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
 }
 
 define half @v_fminimum3_f16_const0(half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_const0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, 0x4800, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_const0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, 0x4800, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_const0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, 0x4800, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_const0:
 ; GFX942:       ; %bb.0:
@@ -1743,15 +1875,25 @@ define half @v_fminimum3_f16_const0(half %b, half %c) {
 }
 
 define half @v_fminimum3_f16__const2(half %a, half %b) {
-; GFX12-LABEL: v_fminimum3_f16__const2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 0x4800
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16__const2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, v1.l, 0x4800
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16__const2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, v1, 0x4800
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16__const2:
 ; GFX942:       ; %bb.0:
@@ -1779,15 +1921,25 @@ define half @v_fminimum3_f16__const2(half %a, half %b) {
 }
 
 define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
-; GFX12-LABEL: v_fminimum3_f16_inlineimm0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, 4.0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, 4.0, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, 4.0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_inlineimm0:
 ; GFX942:       ; %bb.0:
@@ -1814,15 +1966,25 @@ define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
 }
 
 define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
-; GFX12-LABEL: v_fminimum3_f16__inlineimm:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, v1, 4.0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16__inlineimm:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, v1.l, 4.0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16__inlineimm:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, v1, 4.0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16__inlineimm:
 ; GFX942:       ; %bb.0:
@@ -1849,17 +2011,29 @@ define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
 }
 
 define half @v_fminimum3_f16_const1_const2(half %a) {
-; GFX12-LABEL: v_fminimum3_f16_const1_const2:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_movk_i32 s0, 0x4800
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_minimum3_f16 v0, v0, s0, 0x4c00
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_fminimum3_f16_const1_const2:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x4c00
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_minimum3_f16 v0.l, v0.l, 0x4800, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_fminimum3_f16_const1_const2:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_movk_i32 s0, 0x4800
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_minimum3_f16 v0, v0, s0, 0x4c00
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_fminimum3_f16_const1_const2:
 ; GFX942:       ; %bb.0:
@@ -3700,18 +3874,31 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float
 }
 
 define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
-; GFX12-LABEL: v_no_fminimum3_f16__multi_use:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_minimum_f16 v1, v0, v2
-; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_minimum_f16 v0.h, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_no_fminimum3_f16__multi_use:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_minimum_f16 v1, v0, v2
+; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: v_no_fminimum3_f16__multi_use:
 ; GFX942:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index fb2448fb80744..7c0d3692242a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -3,8 +3,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-FLUSH %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-DENORM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-DENORM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FLUSH %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-DENORM,GFX11-DENORM-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-DENORM,GFX11-DENORM-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign  -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s
 
 ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
 ; make add an instruction if the fadd has more than one use.
@@ -443,47 +445,90 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; GFX10-FLUSH-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
-; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16:
-; GFX11-DENORM:       ; %bb.0:
-; GFX11-DENORM-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-DENORM-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v0, s0, -1.0
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v1, s1, -1.0
-; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-DENORM-NEXT:    v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
-; GFX11-DENORM-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
-; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DENORM-NEXT:    v_mul_f16_e32 v1, v0, v0
-; GFX11-DENORM-NEXT:    v_fma_f16 v0, -v1, v0, 1.0
-; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-DENORM-NEXT:    s_endpgm
-;
-; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16:
-; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b32 s0, s[4:5], 0x8
-; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, s0, -1.0
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v1, s1, -1.0
-; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FLUSH-NEXT:    v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
-; GFX11-FLUSH-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v0, v0
-; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, v1, v0
-; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v0, 1.0, v0
-; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-FLUSH-NEXT:    s_endpgm
+; GFX11-DENORM-TRUE16-LABEL: multiple_fadd_use_test_f16:
+; GFX11-DENORM-TRUE16:       ; %bb.0:
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-DENORM-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-TRUE16-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-DENORM-TRUE16-NEXT:    v_add_f16_e64 v0.h, s0, -1.0
+; GFX11-DENORM-TRUE16-NEXT:    v_add_f16_e64 v0.l, s1, -1.0
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-DENORM-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-TRUE16-NEXT:    v_cmp_gt_f16_e64 s2, |v0.l|, |v0.h|
+; GFX11-DENORM-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v0.l, s2
+; GFX11-DENORM-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-TRUE16-NEXT:    v_add_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX11-DENORM-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v0.l, v0.l
+; GFX11-DENORM-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.h, v0.l, 1.0
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DENORM-FAKE16-LABEL: multiple_fadd_use_test_f16:
+; GFX11-DENORM-FAKE16:       ; %bb.0:
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-DENORM-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-FAKE16-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-DENORM-FAKE16-NEXT:    v_add_f16_e64 v0, s0, -1.0
+; GFX11-DENORM-FAKE16-NEXT:    v_add_f16_e64 v1, s1, -1.0
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-DENORM-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-DENORM-FAKE16-NEXT:    v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
+; GFX11-DENORM-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-DENORM-FAKE16-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
+; GFX11-DENORM-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-FAKE16-NEXT:    v_mul_f16_e32 v1, v0, v0
+; GFX11-DENORM-FAKE16-NEXT:    v_fma_f16 v0, -v1, v0, 1.0
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-FAKE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-TRUE16-LABEL: multiple_fadd_use_test_f16:
+; GFX11-FLUSH-TRUE16:       ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e64 v0.h, s0, -1.0
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e64 v0.l, s1, -1.0
+; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_cmp_gt_f16_e64 s0, |v0.l|, |v0.h|
+; GFX11-FLUSH-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v0.l, s0
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_sub_f16_e32 v0.l, 1.0, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: multiple_fadd_use_test_f16:
+; GFX11-FLUSH-FAKE16:       ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x8
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e64 v0, s0, -1.0
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e64 v1, s1, -1.0
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT:    v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
+; GFX11-FLUSH-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
+; GFX11-FLUSH-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT:    v_mul_f16_e32 v1, v0, v0
+; GFX11-FLUSH-FAKE16-NEXT:    v_mul_f16_e32 v0, v1, v0
+; GFX11-FLUSH-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FLUSH-FAKE16-NEXT:    v_sub_f16_e32 v0, 1.0, v0
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -582,38 +627,71 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
-; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16:
-; GFX11-DENORM:       ; %bb.0:
-; GFX11-DENORM-NEXT:    s_clause 0x1
-; GFX11-DENORM-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v1, s2, s2
-; GFX11-DENORM-NEXT:    v_fma_f16 v2, s2, 2.0, s3
-; GFX11-DENORM-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
-; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2 dlc
-; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    s_endpgm
-;
-; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16:
-; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_clause 0x1
-; GFX11-FLUSH-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, s2, s2
-; GFX11-FLUSH-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FLUSH-NEXT:    v_add_f16_e32 v2, s2, v0
-; GFX11-FLUSH-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
-; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    global_store_b16 v1, v2, s[0:1] offset:2 dlc
-; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    s_endpgm
+; GFX11-DENORM-TRUE16-LABEL: multiple_use_fadd_fmac_f16:
+; GFX11-DENORM-TRUE16:       ; %bb.0:
+; GFX11-DENORM-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-DENORM-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-DENORM-TRUE16-NEXT:    v_add_f16_e64 v0.l, s2, s2
+; GFX11-DENORM-TRUE16-NEXT:    v_fma_f16 v0.h, s2, 2.0, s3
+; GFX11-DENORM-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DENORM-FAKE16-LABEL: multiple_use_fadd_fmac_f16:
+; GFX11-DENORM-FAKE16:       ; %bb.0:
+; GFX11-DENORM-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-DENORM-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-FAKE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-DENORM-FAKE16-NEXT:    v_add_f16_e64 v1, s2, s2
+; GFX11-DENORM-FAKE16-NEXT:    v_fma_f16 v2, s2, 2.0, s3
+; GFX11-DENORM-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-FAKE16-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2 dlc
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-TRUE16-LABEL: multiple_use_fadd_fmac_f16:
+; GFX11-FLUSH-TRUE16:       ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT:    s_clause 0x1
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e64 v0.l, s2, s2
+; GFX11-FLUSH-TRUE16-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.h, s2, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: multiple_use_fadd_fmac_f16:
+; GFX11-FLUSH-FAKE16:       ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e64 v0, s2, s2
+; GFX11-FLUSH-FAKE16-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX11-FLUSH-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e32 v2, s2, v0
+; GFX11-FLUSH-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    global_store_b16 v1, v2, s[0:1] offset:2 dlc
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -705,38 +783,71 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
-; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16:
-; GFX11-DENORM:       ; %bb.0:
-; GFX11-DENORM-NEXT:    s_clause 0x1
-; GFX11-DENORM-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v1, |s2|, |s2|
-; GFX11-DENORM-NEXT:    v_fma_f16 v2, |s2|, 2.0, s3
-; GFX11-DENORM-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
-; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2 dlc
-; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    s_endpgm
-;
-; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16:
-; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_clause 0x1
-; GFX11-FLUSH-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, |s2|, |s2|
-; GFX11-FLUSH-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FLUSH-NEXT:    v_add_f16_e32 v2, s2, v0
-; GFX11-FLUSH-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
-; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    global_store_b16 v1, v2, s[0:1] offset:2 dlc
-; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    s_endpgm
+; GFX11-DENORM-TRUE16-LABEL: multiple_use_fadd_fmad_f16:
+; GFX11-DENORM-TRUE16:       ; %bb.0:
+; GFX11-DENORM-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-DENORM-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-DENORM-TRUE16-NEXT:    v_add_f16_e64 v0.l, |s2|, |s2|
+; GFX11-DENORM-TRUE16-NEXT:    v_fma_f16 v0.h, |s2|, 2.0, s3
+; GFX11-DENORM-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DENORM-FAKE16-LABEL: multiple_use_fadd_fmad_f16:
+; GFX11-DENORM-FAKE16:       ; %bb.0:
+; GFX11-DENORM-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-DENORM-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-FAKE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-DENORM-FAKE16-NEXT:    v_add_f16_e64 v1, |s2|, |s2|
+; GFX11-DENORM-FAKE16-NEXT:    v_fma_f16 v2, |s2|, 2.0, s3
+; GFX11-DENORM-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-FAKE16-NEXT:    global_store_b16 v0, v2, s[0:1] offset:2 dlc
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-TRUE16-LABEL: multiple_use_fadd_fmad_f16:
+; GFX11-FLUSH-TRUE16:       ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT:    s_clause 0x1
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e64 v0.l, |s2|, |s2|
+; GFX11-FLUSH-TRUE16-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.h, s2, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1] offset:2 dlc
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: multiple_use_fadd_fmad_f16:
+; GFX11-FLUSH-FAKE16:       ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e64 v0, |s2|, |s2|
+; GFX11-FLUSH-FAKE16-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX11-FLUSH-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e32 v2, s2, v0
+; GFX11-FLUSH-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    global_store_b16 v1, v2, s[0:1] offset:2 dlc
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -836,41 +947,77 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FLUSH-NEXT:    s_endpgm
 ;
-; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
-; GFX11-DENORM:       ; %bb.0:
-; GFX11-DENORM-NEXT:    s_clause 0x2
-; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-DENORM-NEXT:    s_load_b32 s6, s[4:5], 0x8
-; GFX11-DENORM-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
-; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-DENORM-NEXT:    v_fma_f16 v2, |s6|, 2.0, s1
-; GFX11-DENORM-NEXT:    v_fma_f16 v1, |s6|, 2.0, s0
-; GFX11-DENORM-NEXT:    global_store_b16 v0, v1, s[2:3] dlc
-; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[2:3] offset:2 dlc
-; GFX11-DENORM-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-DENORM-NEXT:    s_endpgm
-;
-; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
-; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_clause 0x2
-; GFX11-FLUSH-NEXT:    s_load_b32 s6, s[4:5], 0x8
-; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-FLUSH-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
-; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, |s6|, |s6|
-; GFX11-FLUSH-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FLUSH-NEXT:    v_add_f16_e32 v2, s0, v0
-; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, s1, v0
-; GFX11-FLUSH-NEXT:    global_store_b16 v1, v2, s[2:3] dlc
-; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    global_store_b16 v1, v0, s[2:3] offset:2 dlc
-; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-FLUSH-NEXT:    s_endpgm
+; GFX11-DENORM-TRUE16-LABEL: multiple_use_fadd_multi_fmad_f16:
+; GFX11-DENORM-TRUE16:       ; %bb.0:
+; GFX11-DENORM-TRUE16-NEXT:    s_clause 0x2
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x8
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-DENORM-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-TRUE16-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-DENORM-TRUE16-NEXT:    v_fma_f16 v0.h, |s6|, 2.0, s1
+; GFX11-DENORM-TRUE16-NEXT:    v_fma_f16 v0.l, |s6|, 2.0, s0
+; GFX11-DENORM-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3] dlc
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[2:3] offset:2 dlc
+; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DENORM-FAKE16-LABEL: multiple_use_fadd_multi_fmad_f16:
+; GFX11-DENORM-FAKE16:       ; %bb.0:
+; GFX11-DENORM-FAKE16-NEXT:    s_clause 0x2
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x8
+; GFX11-DENORM-FAKE16-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-DENORM-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-FAKE16-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-DENORM-FAKE16-NEXT:    v_fma_f16 v2, |s6|, 2.0, s1
+; GFX11-DENORM-FAKE16-NEXT:    v_fma_f16 v1, |s6|, 2.0, s0
+; GFX11-DENORM-FAKE16-NEXT:    global_store_b16 v0, v1, s[2:3] dlc
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-FAKE16-NEXT:    global_store_b16 v0, v2, s[2:3] offset:2 dlc
+; GFX11-DENORM-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-DENORM-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-TRUE16-LABEL: multiple_use_fadd_multi_fmad_f16:
+; GFX11-FLUSH-TRUE16:       ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT:    s_clause 0x2
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x8
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e64 v0.l, |s6|, |s6|
+; GFX11-FLUSH-TRUE16-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.h, s0, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, s1, v0.l
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_d16_hi_b16 v1, v0, s[2:3] dlc
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3] offset:2 dlc
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: multiple_use_fadd_multi_fmad_f16:
+; GFX11-FLUSH-FAKE16:       ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x8
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e64 v0, |s6|, |s6|
+; GFX11-FLUSH-FAKE16-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-FLUSH-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e32 v2, s0, v0
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e32 v0, s1, v0
+; GFX11-FLUSH-FAKE16-NEXT:    global_store_b16 v1, v2, s[2:3] dlc
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    global_store_b16 v1, v0, s[2:3] offset:2 dlc
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-FAKE16-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -914,19 +1061,33 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fmul_x2_xn2_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e64 v0, s2, -4.0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f16_e32 v0, s2, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fmul_x2_xn2_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, s2, -4.0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, s2, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fmul_x2_xn2_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e64 v0, s2, -4.0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, s2, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -968,19 +1129,33 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fmul_x2_xn3_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e64 v0, 0xc600, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f16_e32 v0, s2, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fmul_x2_xn3_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, 0xc600, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, s2, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fmul_x2_xn3_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e64 v0, 0xc600, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, s2, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_endpgm
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -993,3 +1168,6 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
 
 attributes #0 = { nounwind "unsafe-fp-math"="true" }
 attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-DENORM: {{.*}}
+; GFX11-FLUSH: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 8c91acd5ae024..12daf10594df5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -3,8 +3,10 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
 
 ; Test fmul by power of 2 which is better emitted as ldexp
 
@@ -53,11 +55,41 @@ define double @v_mul_42_f64(double %x) {
 }
 
 define half @v_mul_42_f16(half %x) {
-; GCN-LABEL: v_mul_42_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x5140, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_42_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x5140, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_42_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x5140, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_42_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5140, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_42_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5140, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_42_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5140, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_42_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5140, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 42.0
   ret half %mul
 }
@@ -2644,174 +2676,684 @@ define amdgpu_ps <2 x i32> @s_mul_32_f64(double inreg %x, double inreg %y) {
 
 ; 0x1p-23
 define half @v_mul_0x1pn23_f16(half %x) {
-; GCN-LABEL: v_mul_0x1pn23_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 2, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_0x1pn23_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_0x1pn23_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_0x1pn23_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 2, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_0x1pn23_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_0x1pn23_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 2, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_0x1pn23_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 2, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0xH0002
   ret half %mul
 }
 
 ; 0x1p-17
 define half @v_mul_0x1pn17_f16(half %x) {
-; GCN-LABEL: v_mul_0x1pn17_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x80, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_0x1pn17_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x80, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_0x1pn17_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x80, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_0x1pn17_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x80, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_0x1pn17_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x80, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_0x1pn17_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x80, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_0x1pn17_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x80, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.00000762939453125
   ret half %mul
 }
 
 ; 0x1p-16
 define half @v_mul_0x1pn16_f16(half %x) {
-; GCN-LABEL: v_mul_0x1pn16_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x100, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_0x1pn16_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x100, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_0x1pn16_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x100, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_0x1pn16_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x100, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_0x1pn16_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x100, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_0x1pn16_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x100, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_0x1pn16_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x100, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.0000152587890625
   ret half %mul
 }
 
 ; 0x1p-15
 define half @v_mul_0x1pn15_f16(half %x) {
-; GCN-LABEL: v_mul_0x1pn15_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x200, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_0x1pn15_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x200, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_0x1pn15_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x200, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_0x1pn15_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x200, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_0x1pn15_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x200, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_0x1pn15_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x200, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_0x1pn15_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x200, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.000030517578125
   ret half %mul
 }
 
 define half @v_mul_neg256_f16(half %x) {
-; GCN-LABEL: v_mul_neg256_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0xdc00, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg256_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0xdc00, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg256_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0xdc00, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg256_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xdc00, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg256_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xdc00, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg256_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xdc00, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg256_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xdc00, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -256.0
   ret half %mul
 }
 
 define half @v_mul_neg128_f16(half %x) {
-; GCN-LABEL: v_mul_neg128_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0xd800, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg128_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0xd800, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg128_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0xd800, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg128_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xd800, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg128_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xd800, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg128_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xd800, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg128_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xd800, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -128.0
   ret half %mul
 }
 
 define half @v_mul_neg64_f16(half %x) {
-; GCN-LABEL: v_mul_neg64_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0xd400, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg64_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0xd400, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg64_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0xd400, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg64_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xd400, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg64_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xd400, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg64_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xd400, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg64_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xd400, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -64.0
   ret half %mul
 }
 
 define half @v_mul_neg32_f16(half %x) {
-; GCN-LABEL: v_mul_neg32_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0xd000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg32_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0xd000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0xd000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xd000, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xd000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xd000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xd000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -32.0
   ret half %mul
 }
 
 define half @v_mul_neg16_f16(half %x) {
-; GCN-LABEL: v_mul_neg16_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0xcc00, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg16_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0xcc00, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg16_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0xcc00, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg16_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xcc00, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg16_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xcc00, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg16_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xcc00, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg16_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xcc00, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -16.0
   ret half %mul
 }
 
 define half @v_mul_neg8_f16(half %x) {
-; GCN-LABEL: v_mul_neg8_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0xc800, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg8_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0xc800, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg8_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0xc800, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg8_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xc800, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg8_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xc800, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg8_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xc800, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg8_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xc800, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -8.0
   ret half %mul
 }
 
 define half @v_mul_neg4_f16(half %x) {
-; GCN-LABEL: v_mul_neg4_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, -4.0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg4_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, -4.0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg4_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, -4.0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg4_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, -4.0, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg4_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, -4.0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg4_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, -4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg4_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, -4.0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -4.0
   ret half %mul
 }
 
 define half @v_mul_neg2_f16(half %x) {
-; GCN-LABEL: v_mul_neg2_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, -2.0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg2_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, -2.0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg2_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, -2.0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg2_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, -2.0, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg2_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, -2.0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg2_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, -2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg2_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, -2.0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -2.0
   ret half %mul
 }
 
 define half @v_mul_neg1_f16(half %x) {
-; GCN-LABEL: v_mul_neg1_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg1_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg1_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg1_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg1_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg1_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg1_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -1.0
   ret half %mul
 }
 
 define half @v_mul_neg_half_f16(half %x) {
-; GCN-LABEL: v_mul_neg_half_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, -0.5, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg_half_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, -0.5, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg_half_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, -0.5, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg_half_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, -0.5, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg_half_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, -0.5, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg_half_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, -0.5, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg_half_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, -0.5, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -0.5
   ret half %mul
 }
 
 define half @v_mul_neg_quarter_f16(half %x) {
-; GCN-LABEL: v_mul_neg_quarter_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0xb400, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_neg_quarter_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0xb400, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_neg_quarter_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0xb400, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_neg_quarter_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xb400, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_neg_quarter_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xb400, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_neg_quarter_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0xb400, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_neg_quarter_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0xb400, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, -0.25
   ret half %mul
 }
 
 define half @v_mul_quarter_f16(half %x) {
-; GCN-LABEL: v_mul_quarter_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x3400, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_quarter_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x3400, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_quarter_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x3400, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_quarter_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x3400, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_quarter_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x3400, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_quarter_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x3400, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_quarter_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x3400, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.25
   ret half %mul
 }
 
 define half @v_mul_half_f16(half %x) {
-; GCN-LABEL: v_mul_half_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0.5, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_half_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_half_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_half_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.5, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_half_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_half_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.5, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_half_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 0.5
   ret half %mul
 }
@@ -2850,109 +3392,391 @@ define half @v_mul_2_f16(half %x) {
 ; GFX10-GISEL-NEXT:    v_mul_f16_e32 v0, 2.0, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_2_f16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_2_f16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, 2.0, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_2_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_2_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_2_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 2.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_2_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 2.0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 2.0
   ret half %mul
 }
 
 define half @v_mul_4_f16(half %x) {
-; GCN-LABEL: v_mul_4_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 4.0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_4_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_4_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_4_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 4.0, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_4_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_4_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 4.0, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_4_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 4.0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 4.0
   ret half %mul
 }
 
 define half @v_mul_8_f16(half %x) {
-; GCN-LABEL: v_mul_8_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_8_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_8_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_8_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x4800, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_8_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_8_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x4800, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_8_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 8.0
   ret half %mul
 }
 
 define half @v_mul_16_f16(half %x) {
-; GCN-LABEL: v_mul_16_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x4c00, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_16_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x4c00, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_16_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x4c00, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_16_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x4c00, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_16_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x4c00, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_16_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x4c00, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_16_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x4c00, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 16.0
   ret half %mul
 }
 
 define half @v_mul_32_f16(half %x) {
-; GCN-LABEL: v_mul_32_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_32_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5000, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 32.0
   ret half %mul
 }
 
 define half @v_mul_64_f16(half %x) {
-; GCN-LABEL: v_mul_64_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x5400, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %mul = fmul half %x, 64.0
-  ret half %mul
-}
-
+; GFX9-LABEL: v_mul_64_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x5400, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_64_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x5400, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_64_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5400, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_64_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5400, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_64_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5400, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_64_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5400, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+  %mul = fmul half %x, 64.0
+  ret half %mul
+}
+
 define half @v_mul_128_f16(half %x) {
-; GCN-LABEL: v_mul_128_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x5800, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_128_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x5800, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_128_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x5800, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_128_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5800, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_128_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5800, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_128_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5800, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_128_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5800, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 128.0
   ret half %mul
 }
 
 define half @v_mul_256_f16(half %x) {
-; GCN-LABEL: v_mul_256_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x5c00, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_256_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x5c00, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_256_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x5c00, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_256_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5c00, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_256_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5c00, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_256_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5c00, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_256_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5c00, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 256.0
   ret half %mul
 }
 
 ; 0x1p+15
 define half @v_mul_0x1p15_f16(half %x) {
-; GCN-LABEL: v_mul_0x1p15_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x7800, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_0x1p15_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x7800, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_0x1p15_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x7800, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_0x1p15_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x7800, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_0x1p15_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x7800, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_0x1p15_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x7800, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_0x1p15_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x7800, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 32768.0
   ret half %mul
 }
 
 ; 0x1p+14
 define half @v_mul_0x1p14_f16(half %x) {
-; GCN-LABEL: v_mul_0x1p14_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x7400, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_mul_0x1p14_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x7400, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_mul_0x1p14_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x7400, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_0x1p14_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x7400, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_0x1p14_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x7400, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_0x1p14_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x7400, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_0x1p14_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x7400, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul half %x, 16384.0
   ret half %mul
 }
@@ -2973,11 +3797,36 @@ define half @v_fma_mul_add_32_f16(half %x, half %y) {
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v2, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_fma_mul_add_32_f16:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_fmamk_f16 v0, v0, 0x5000, v1
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_fma_mul_add_32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_fmamk_f16 v0, v0, 0x5000, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_fma_mul_add_32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0x5000, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_fma_mul_add_32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_fmamk_f16 v0, v0, 0x5000, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_fma_mul_add_32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, 0x5000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_fma_mul_add_32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_fmamk_f16 v0, v0, 0x5000, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract half %x, 32.0
   %fma = fadd contract half %mul, %y
   ret half %fma
@@ -2998,11 +3847,35 @@ define half @v_fma_mul_sub_32_f16(half %x, half %y) {
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v2, -v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_fma_mul_sub_32_f16:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_fma_f16 v0, v0, 0x5000, -v1
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_fma_mul_sub_32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_fma_f16 v0, v0, 0x5000, -v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_fma_mul_sub_32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 0x5000, -v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_fma_mul_sub_32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v0, 0x5000, -v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_fma_mul_sub_32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, 0x5000, -v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_fma_mul_sub_32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_fma_f16 v0, v0, 0x5000, -v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract half %x, 32.0
   %fma = fsub contract half %mul, %y
   ret half %fma
@@ -3023,11 +3896,36 @@ define half @v_fma_mul_add_neg32_f16(half %x, half %y) {
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v2, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_fma_mul_add_neg32_f16:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_fmamk_f16 v0, v0, 0xd000, v1
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_fma_mul_add_neg32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_fmamk_f16 v0, v0, 0xd000, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_fma_mul_add_neg32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0xd000, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_fma_mul_add_neg32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_fmamk_f16 v0, v0, 0xd000, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_fma_mul_add_neg32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, 0xd000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_fma_mul_add_neg32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_fmamk_f16 v0, v0, 0xd000, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mul = fmul contract half %x, -32.0
   %fma = fadd contract half %mul, %y
   ret half %fma
@@ -3048,11 +3946,35 @@ define half @v_mul_fabs_32_f16(half %x) {
 ; GFX9-GISEL-NEXT:    v_mul_f16_e64 v0, |v0|, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_mul_fabs_32_f16:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_mul_f16_e64 v0, 0x5000, |v0|
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_mul_fabs_32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e64 v0, 0x5000, |v0|
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_fabs_32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e64 v0.l, 0x5000, |v0.l|
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_fabs_32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e64 v0, 0x5000, |v0|
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_fabs_32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e64 v0.l, 0x5000, |v0.l|
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_fabs_32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e64 v0, 0x5000, |v0|
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call half @llvm.fabs.f16(half %x)
   %mul = fmul half %x.fabs, 32.0
   ret half %mul
@@ -3073,11 +3995,35 @@ define half @v_mul_add_fma_fabs_32_f16(half %x, half %y) {
 ; GFX9-GISEL-NEXT:    v_fma_f16 v0, |v0|, v2, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_mul_add_fma_fabs_32_f16:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_fma_f16 v0, |v0|, 0x5000, v1
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_mul_add_fma_fabs_32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_fma_f16 v0, |v0|, 0x5000, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_add_fma_fabs_32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, |v0.l|, 0x5000, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_add_fma_fabs_32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, |v0|, 0x5000, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_add_fma_fabs_32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_fma_f16 v0.l, |v0.l|, 0x5000, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_add_fma_fabs_32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_fma_f16 v0, |v0|, 0x5000, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call half @llvm.fabs.f16(half %x)
   %mul = fmul contract half %x.fabs, 32.0
   %fma = fadd contract half %mul, %y
@@ -3268,12 +4214,40 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX1011-LABEL: s_mul_32_f16:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    v_mul_f16_e64 v0, 0x5000, s0
-; GFX1011-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX1011-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX1011-NEXT:    ; return to shader part epilog
+; GFX10-LABEL: s_mul_32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    v_mul_f16_e64 v0, 0x5000, s0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-TRUE16-LABEL: s_mul_32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e64 v0.l, 0x5000, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-SDAG-FAKE16-LABEL: s_mul_32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e64 v0, 0x5000, s0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-GISEL-TRUE16-LABEL: s_mul_32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e64 v0.l, 0x5000, s0
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-GISEL-FAKE16-LABEL: s_mul_32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e64 v0, 0x5000, s0
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %mul = fmul contract half %x, 32.0
   %cast = bitcast half %mul to i16
   %zext = zext i16 %cast to i32
@@ -4678,13 +5652,45 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
 ; GFX9-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_mul_f16_select_64_1:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX1011-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_mul_f16_select_64_1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_f16_select_64_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_f16_select_64_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_f16_select_64_1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_f16_select_64_1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 64.0, half 1.0
   %mul = fmul half %x, %select.pow2
@@ -4700,13 +5706,45 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
 ; GFX9-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_mul_f16_select_1_64:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX1011-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_mul_f16_select_1_64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_f16_select_1_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_f16_select_1_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_f16_select_1_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_f16_select_1_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 1.0, half 64.0
   %mul = fmul half %x, %select.pow2
@@ -4722,13 +5760,45 @@ define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) {
 ; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_mul_f16_select_n1_n64:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX1011-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_mul_f16_select_n1_n64:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX10-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_f16_select_n1_n64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_f16_select_n1_n64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_f16_select_n1_n64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_f16_select_n1_n64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -1.0, half -64.0
   %mul = fmul half %x, %select.pow2
@@ -4775,24 +5845,43 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX10-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f16_select_128_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_f16_select_128_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_f16_select_128_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_f16_select_128_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_f16_select_128_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_f16_select_128_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
   %mul = fmul half %x, %select.pow2
@@ -4839,24 +5928,43 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX10-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_f16_select_n128_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_f16_select_n128_n64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_f16_select_n128_n64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_f16_select_n128_n64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_f16_select_n128_n64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -64.0
   %mul = fmul half %x, %select.pow2
@@ -4872,13 +5980,45 @@ define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) {
 ; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_mul_f16_select_n128_n16:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1011-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX1011-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_mul_f16_select_n128_n16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX10-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_mul_f16_select_n128_n16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_f16_select_n128_n16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_f16_select_n128_n16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_f16_select_n128_n16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half -128.0, half -16.0
   %mul = fmul half %x, %select.pow2
@@ -4923,23 +6063,41 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_64_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_64_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x5400
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_64_1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_64_1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 64.0, half 1.0
   %mul = fmul contract half %x, %select.pow2
@@ -4985,23 +6143,41 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_1_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_1_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_1_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_1_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half 1.0, half 64.0
   %mul = fmul contract half %x, %select.pow2
@@ -5047,23 +6223,41 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_n64_n1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd400
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_n64_n1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0xd400
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_n64_n1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_n64_n1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -64.0, half -1.0
   %mul = fmul contract half %x, %select.pow2
@@ -5109,23 +6303,41 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_n1_n64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_n1_n64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0xbc00
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_n1_n64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e64 v0.l, -v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_n1_n64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e64 v0, -v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select contract i1 %cond, half -1.0, half -64.0
   %mul = fmul contract half %x, %select.pow2
@@ -5178,26 +6390,47 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_128_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_128_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x5800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_128_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_128_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 64.0
   %mul = fmul contract half %x, %select.pow2
@@ -5243,23 +6476,41 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_128_4:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_128_4:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x5800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_128_4:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_128_4:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 128.0, half 4.0
   %mul = fmul contract half %x, %select.pow2
@@ -5312,26 +6563,47 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_2_4:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4000
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_2_4:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_2_4:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_2_4:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 2.0, half 4.0
   %mul = fmul contract half %x, %select.pow2
@@ -5377,23 +6649,41 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4400
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_f16_select_4_128:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x4400
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5800, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_fma_f16 v0.l, v1.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_f16_select_4_128:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x4400
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_fma_f16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_f16_select_4_128:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_f16_select_4_128:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
   %select.pow2 = select i1 %cond, half 4.0, half 128.0
   %mul = fmul contract half %x, %select.pow2
@@ -5452,31 +6742,54 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_v2f16_select_64_1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_v2f16_select_64_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_64_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_v2f16_select_64_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x5400
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_v2f16_select_64_1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_v2f16_select_64_1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v3, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
   %mul = fmul <2 x half> %x, %select.pow2
@@ -5534,31 +6847,54 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_v2f16_select_1_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_v2f16_select_1_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_1_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_v2f16_select_1_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_v2f16_select_1_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_v2f16_select_1_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v3, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
   %mul = fmul <2 x half> %x, %select.pow2
@@ -5618,32 +6954,56 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_n1_n64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xbc00
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xd400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_v2f16_select_n1_n64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0xbc00
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_v2f16_select_n1_n64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_v2f16_select_n1_n64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v3, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
   %mul = fmul <2 x half> %x, %select.pow2
@@ -5712,36 +7072,63 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_v2f16_select_128_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_v2f16_select_128_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_128_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_v2f16_select_128_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x5800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_v2f16_select_128_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v3, 0x7fff :: v_dual_add_nc_u32 v0, 6, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_v2f16_select_128_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
   %mul = fmul <2 x half> %x, %select.pow2
@@ -5812,37 +7199,65 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v1, 6, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_n128_n64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xd400, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_v2f16_select_n128_n64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0xd800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_v2f16_select_n128_n64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v3, 0x7fff :: v_dual_add_nc_u32 v0, 6, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_v2f16_select_n128_n64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v1, 6, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -64.0, half -64.0>
   %mul = fmul <2 x half> %x, %select.pow2
@@ -5902,32 +7317,56 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v3, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_mul_v2f16_select_n128_n16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xcc00, v3.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xcc00, v3.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_mul_v2f16_select_n128_n16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v3, 0xd800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_f16 v0, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_mul_v2f16_select_n128_n16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_mul_v2f16_select_n128_n16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 4, 7, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 4, 7, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v3, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -128.0, half -128.0>, <2 x half> <half -16.0, half -16.0>
   %mul = fmul <2 x half> %x, %select.pow2
@@ -5987,32 +7426,56 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_64_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5400
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_64_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0x5400
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_64_1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_64_1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 64.0, half 64.0>, <2 x half> <half 1.0, half 1.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6073,32 +7536,56 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_1_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x3c00
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_1_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0x3c00
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_1_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_1_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 1.0, half 1.0>, <2 x half> <half 64.0, half 64.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6161,33 +7648,58 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xd400
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0xd400
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 6, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -64.0, half -64.0>, <2 x half> <half -1.0, half -1.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6250,33 +7762,58 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xbc00
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xd400, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xd400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0xbc00
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 6, 0, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half -1.0, half -1.0>, <2 x half> <half -64.0, half -64.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6348,36 +7885,64 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 6, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 6, v1
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v5, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_128_64:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5400, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_128_64:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0x5800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_128_64:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 6, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_128_64:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 6, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 6, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v5, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 64.0, half 64.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6438,32 +8003,56 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_128_4:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5800
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_128_4:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0x5800
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_128_4:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_128_4:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 2, 7, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 2, 7, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 128.0, half 128.0>, <2 x half> <half 4.0, half 4.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6535,36 +8124,64 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 2, v1
-; GFX11-GISEL-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v5, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_2_4:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4000
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_2_4:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_2_4:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 2, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_2_4:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v4, 0x7fff :: v_dual_add_nc_u32 v1, 2, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v0, 0xffff8000, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v5, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 2.0, half 2.0>, <2 x half> <half 4.0, half 4.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6625,32 +8242,56 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc_lo
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: v_contract_mul_add_v2f16_select_4_128:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4400
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x5800, v4.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x5800, v4.l, s0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_contract_mul_add_v2f16_select_4_128:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, 0x4400
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_contract_mul_add_v2f16_select_4_128:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc_lo
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.h, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_contract_mul_add_v2f16_select_4_128:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 7, 2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 7, 2, vcc_lo
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
   %select.pow2 = select <2 x i1> %cond, <2 x half> <half 4.0, half 4.0>, <2 x half> <half 128.0, half 128.0>
   %mul = fmul contract <2 x half> %x, %select.pow2
@@ -6757,11 +8398,41 @@ define double @v_constrained_fmul_0x1p64_f64(double %x, double %y) #0 {
 }
 
 define half @v_constrained_fmul_32_f16(half %x, half %y) #0 {
-; GCN-LABEL: v_constrained_fmul_32_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_constrained_fmul_32_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_constrained_fmul_32_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: v_constrained_fmul_32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5000, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: v_constrained_fmul_32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: v_constrained_fmul_32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x5000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: v_constrained_fmul_32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x5000, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half 32.0, metadata !"round.dynamic", metadata !"fpexcept.strict")
   ret half %val
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index fe5601594dca8..e9fd6119d0c36 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 declare half @llvm.nearbyint.f16(half) #0
 declare float @llvm.nearbyint.f32(float) #0
@@ -51,16 +52,27 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fnearbyint_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_rndne_f16_e32 v1, s2
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fnearbyint_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fnearbyint_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v1, s2
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %1 = call half @llvm.nearbyint.f16(half %in)
   store half %1, ptr addrspace(1) %out
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 058c273a65d99..ff894d184e6c4 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define i32 @fneg_xor_select_i32(i1 %cond, i32 %arg0, i32 %arg1) {
 ; GCN-LABEL: fneg_xor_select_i32:
@@ -158,15 +159,25 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
 ; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_xor_select_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fneg_xor_select_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_xor_select_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
   %fneg = xor i16 %select, -32768
   ret i16 %fneg
@@ -206,22 +217,36 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_xor_select_v2i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fneg_xor_select_v2i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, v2.h, s0
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_xor_select_v2i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %select = select <2 x i1> %cond, <2 x i16> %arg0, <2 x i16> %arg1
   %fneg = xor <2 x i16> %select, <i16 -32768, i16 -32768>
   ret <2 x i16> %fneg
@@ -250,16 +275,27 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_xor_select_i16_multi_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v1
-; GFX11-NEXT:    global_store_b16 v[3:4], v1, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fneg_xor_select_i16_multi_use:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    global_store_d16_hi_b16 v[3:4], v0, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_xor_select_i16_multi_use:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v[3:4], v1, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
   store i16 %select, ptr addrspace(1) %ptr
   %fneg = xor i16 %select, -32768
@@ -578,21 +614,37 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: select_fneg_select_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: select_fneg_select_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: select_fneg_select_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg half %arg0
   %select0 = select i1 %cond0, half %arg1, half %fneg0
   %fneg1 = fneg half %select0
@@ -614,21 +666,37 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: select_fneg_xor_select_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: select_fneg_xor_select_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v3.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: select_fneg_xor_select_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i16 %arg0, -32768
   %select0 = select i1 %cond0, i16 %arg1, i16 %fneg0
   %fneg1 = xor i16 %select0, -32768
@@ -699,33 +767,56 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: select_fneg_select_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_perm_b32 v4, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: select_fneg_select_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v5.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: select_fneg_select_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg <2 x half> %arg0
   %select0 = select <2 x i1> %cond0, <2 x half> %arg1, <2 x half> %fneg0
   %fneg1 = fneg <2 x half> %select0
@@ -784,33 +875,56 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: select_fneg_xor_select_v2i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_perm_b32 v4, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: select_fneg_xor_select_v2i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v4.h, v5.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v4.l, v5.l, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: select_fneg_xor_select_v2i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor <2 x i16> %arg0, <i16 -32768, i16 -32768>
   %select0 = select <2 x i1> %cond0, <2 x i16> %arg1, <2 x i16> %fneg0
   %fneg1 = xor <2 x i16> %select0, <i16 -32768, i16 -32768>
@@ -931,20 +1045,33 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: cospiD_pattern0_half:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 0xffff8000, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_xor_b32_e32 v0, v2, v0
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: cospiD_pattern0_half:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, v0.l, 1
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, v1, v2, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: cospiD_pattern0_half:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 0xffff8000, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, v2, v0
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %i = and i16 %arg, 1
   %i3 = icmp eq i16 %i, 0
   %i4 = select i1 %i3, float %arg2, float %arg1
@@ -980,16 +1107,27 @@ define float @cospiD_pattern1_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, -v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: cospiD_pattern1_half:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, -v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: cospiD_pattern1_half:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, v0.l, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, v1, -v1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: cospiD_pattern1_half:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i16_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v1, -v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %i = and i16 %arg, 1
   %i3 = icmp eq i16 %i, 0
   %i4 = select i1 %i3, float %arg2, float %arg1
@@ -1170,14 +1308,23 @@ define double @fneg_f64_bitcast_build_vector_v4i16_to_f64(i16 %elt0, i16 %elt1,
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_f64_bitcast_build_vector_v4i16_to_f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <4 x i16> poison, i16 %elt0, i32 0
   %insert.1 = insertelement <4 x i16> %insert.0, i16 %elt1, i32 1
   %insert.2 = insertelement <4 x i16> %insert.1, i16 %elt2, i32 2
@@ -1211,14 +1358,23 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <4 x half> poison, half %elt0, i32 0
   %insert.1 = insertelement <4 x half> %insert.0, half %elt1, i32 1
   %insert.2 = insertelement <4 x half> %insert.1, half %elt2, i32 2
@@ -1252,14 +1408,23 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
 ; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v1, 0x80000000, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %insert.0 = insertelement <4 x bfloat> poison, bfloat %elt0, i32 0
   %insert.1 = insertelement <4 x bfloat> %insert.0, bfloat %elt1, i32 1
   %insert.2 = insertelement <4 x bfloat> %insert.1, bfloat %elt2, i32 2

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 87f1303ab8f5d..07a7d8d20c439 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
 ; RUN: not llc -mtriple=r600 -mcpu=redwood < %s
 
 define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
@@ -516,11 +517,29 @@ define double @v_fneg_i64_fp_use(i64 %in) {
 }
 
 define i16 @v_fneg_i16(i16 %in) {
-; GCN-LABEL: v_fneg_i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: v_fneg_i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_fneg_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fneg_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg = xor i16 %in, -32768
   ret i16 %fneg
 }
@@ -551,16 +570,27 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: s_fneg_i16_fp_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_sub_f16_e64 v1, 2.0, s2
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_i16_fp_use:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_sub_f16_e64 v0.l, 2.0, s2
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_i16_fp_use:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_sub_f16_e64 v1, 2.0, s2
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %fneg = xor i16 %in, -32768
   %bitcast = bitcast i16 %fneg to half
   %fadd = fadd half %bitcast, 2.0
@@ -582,11 +612,17 @@ define half @v_fneg_i16_fp_use(i16 %in) {
 ; VI-NEXT:    v_sub_f16_e32 v0, 2.0, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fneg_i16_fp_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_sub_f16_e32 v0, 2.0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fneg_i16_fp_use:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v0.l, 2.0, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fneg_i16_fp_use:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_sub_f16_e32 v0, 2.0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg = xor i16 %in, -32768
   %bitcast = bitcast i16 %fneg to half
   %fadd = fadd half %bitcast, 2.0

diff  --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 2af12d150154a..63ba18a5433aa 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 declare i16 @llvm.umax.i16(i16, i16)
 declare i64 @llvm.umin.i64(i64, i64)
@@ -208,39 +209,70 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; GFX10-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_pow2_8xhalf:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v4, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v5, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v6, v6
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v7, v7
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
-; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
-; GFX11-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fmul_pow2_8xhalf:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v4, v2, 1 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v2, v1, 1 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v1.h, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v2.h, v2.h
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v3.h, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.h, v3.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v0.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, 0x7000, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fmul_pow2_8xhalf:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v4, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v5, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v6, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v7, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v6
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
   %p2_f = uitofp <8 x i16> %p2 to <8 x half>
   %r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
@@ -284,28 +316,47 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_pow2_ldexp_8xhalf:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_ldexp_f16_e32 v1, 0x7000, v1
-; GFX11-NEXT:    v_ldexp_f16_e32 v0, 0x7000, v0
-; GFX11-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v6
-; GFX11-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v7
-; GFX11-NEXT:    v_ldexp_f16_e32 v2, 0x7000, v2
-; GFX11-NEXT:    v_ldexp_f16_e32 v3, 0x7000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
-; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v3.l, 0x7000, v3.l
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v2.l, 0x7000, v2.l
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, 0x7000, v1.l
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, 0x7000, v0.l
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, 0x7000, v0.h
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, 0x7000, v1.h
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v2.h, 0x7000, v2.h
+; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v3.h, 0x7000, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fmul_pow2_ldexp_8xhalf:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v1, 0x7000, v1
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v0, 0x7000, v0
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v6
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v7
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v2, 0x7000, v2
+; GFX11-FAKE16-NEXT:    v_ldexp_f16_e32 v3, 0x7000, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v6
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v2, v5, v2
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
   ret <8 x half> %r
 }
@@ -590,16 +641,27 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
 ; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_pow_mul_max_pow2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fmul_pow_mul_max_pow2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.l, 2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fmul_pow_mul_max_pow2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v0, v0, 2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shl2 = shl nuw i16 2, %cnt
   %shl1 = shl nuw i16 1, %cnt
   %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2)
@@ -1029,18 +1091,30 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
   %conv = uitofp <2 x i16> %shl to <2 x half>
   %mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv
@@ -1117,18 +1191,31 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
 ; GFX10-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fmul_pow_shl_cnt_safe:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
-; GFX11-NEXT:    s_mov_b32 s0, 0xff5f3992
-; GFX11-NEXT:    s_mov_b32 s1, 0x7befffff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_safe:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.l, 1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0xff5f3992
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, 0x7befffff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fmul_pow_shl_cnt_safe:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0xff5f3992
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0x7befffff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f64 v[0:1], v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to double
   %mul = fmul double 9.745314e+288, %conv
@@ -1479,32 +1566,60 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
-; GFX11-NEXT:    s_mov_b32 s0, 0x46000000
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v2, 0x46000000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0x46000000
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v2, v4, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.h, v0.l, 0x7000
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0x46000000
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v2, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i32 1, %cnt
   %conv = uitofp i32 %shl to half
   %mul = fdiv half 0xH7000, %conv
@@ -1526,13 +1641,21 @@ define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
 ; GFX10-NEXT:    v_sub_nc_u16 v0, 0x7000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v0, 10, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u16 v0, 0x7000, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fdiv_pow_shl_cnt_in_bounds:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 10, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, 0x7000, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_in_bounds:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v0, 10, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v0, 0x7000, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to half
   %mul = fdiv half 0xH7000, %conv
@@ -1554,13 +1677,21 @@ define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
 ; GFX10-NEXT:    v_sub_nc_u16 v0, 0x4800, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v0, 10, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u16 v0, 0x4800, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 10, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.l, 0x4800, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_in_bounds2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v0, 10, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v0, 0x4800, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to half
   %mul = fdiv half 0xH4800, %conv
@@ -1604,31 +1735,58 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
-; GFX11-NEXT:    s_mov_b32 s0, 2.0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_add_f32_e32 v2, v1, v1
-; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v1
-; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.l, 1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 2.0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, v1, v1
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v2, v4, v1
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.h, v0.l, 2.0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 2.0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_u16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v2, v1, v1
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v2, v3, v1
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i16 1, %cnt
   %conv = uitofp i16 %shl to half
   %mul = fdiv half 0xH4000, %conv

diff  --git a/llvm/test/CodeGen/AMDGPU/fpext-free.ll b/llvm/test/CodeGen/AMDGPU/fpext-free.ll
index 3752100b7dc0f..d2343740855d6 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext-free.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext-free.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-F32FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-F32DENORM %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-F32FLUSH,GFX11-F32FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-F32FLUSH,GFX11-F32FLUSH-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-F32DENORM,GFX11-F32DENORM-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-F32DENORM,GFX11-F32DENORM-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32FLUSH %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9-F32DENORM %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
@@ -8,14 +10,23 @@
 
 ;  fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
 define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
-; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fpext_fmul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fpext_fmul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -39,16 +50,27 @@ entry:
 
 ; f16->f64 is not free.
 define double @fadd_fpext_fmul_f16_to_f64(half %x, half %y, double %z) #0 {
-; GFX11-LABEL: fadd_fpext_fmul_f16_to_f64:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fpext_fmul_f16_to_f64:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fpext_fmul_f16_to_f64:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: fadd_fpext_fmul_f16_to_f64:
 ; GFX89:       ; %bb.0: ; %entry
@@ -92,14 +114,23 @@ entry:
 
 ; fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
 define float @fadd_fpext_fmul_f16_to_f32_commute(half %x, half %y, float %z) #0 {
-; GFX11-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -124,14 +155,23 @@ entry:
 ; fold (fadd (fma x, y, (fpext (fmul u, v))), z)
 ;   -> (fma x, y, (fma (fpext u), (fpext v), z))
 define float @fadd_muladd_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
-; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v2.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -160,14 +200,23 @@ entry:
 ; fold (fadd x, (fma y, z, (fpext (fmul u, v)))
 ;   -> (fma y, z, (fma (fpext u), (fpext v), x))
 define float @fadd_muladd_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
-; GFX11-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
-; GFX11-NEXT:    v_add_f32_e32 v0, v4, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v2.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_muladd_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -194,14 +243,23 @@ entry:
 }
 
 define float @fadd_fmad_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
-; GFX11-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v2.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fmad_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -231,14 +289,23 @@ entry:
 ; fold (fadd (fma x, y, (fpext (fmul u, v))), z)
 ;   -> (fma x, y, (fma (fpext u), (fpext v), z))
 define float @fadd_fma_fpext_fmul_f16_to_f32(float %x, float %y, half %u, half %v, float %z) #0 {
-; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v2.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -265,14 +332,23 @@ entry:
 }
 
 define float @fadd_fma_fpext_fmul_f16_to_f32_commute(float %x, float %y, half %u, half %v, float %z) #0 {
-; GFX11-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v2, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
-; GFX11-NEXT:    v_add_f32_e32 v0, v4, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v2.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[0,0,1]
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fma_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -301,16 +377,27 @@ entry:
 ; fold (fadd x, (fpext (fma y, z, (fmul u, v)))
 ;   -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
 define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
-; GFX11-LABEL: fadd_fpext_fmuladd_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fpext_fmuladd_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.h, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v1.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fpext_fmuladd_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmuladd_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -336,16 +423,27 @@ entry:
 }
 
 define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, half %v) #0 {
-; GFX11-LABEL: fadd_fpext_fma_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fpext_fma_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.h, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v1.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fpext_fma_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -371,16 +469,27 @@ entry:
 }
 
 define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
-; GFX11-LABEL: fadd_fpext_fma_f16_to_f32_commute:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fadd_fpext_fma_f16_to_f32_commute:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.h, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v1.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fadd_fpext_fma_f16_to_f32_commute:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -408,14 +517,23 @@ entry:
 ; fold (fsub (fpext (fmul x, y)), z)
 ;   -> (fma (fpext x), (fpext y), (fneg z))
 define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
-; GFX11-LABEL: fsub_fpext_fmul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fsub_fpext_fmul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fsub_fpext_fmul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -446,14 +564,23 @@ define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0
 ; GFX11-F32FLUSH-NEXT:    v_fma_mix_f32 v0, -v1, v2, v0 op_sel_hi:[1,1,0]
 ; GFX11-F32FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-F32DENORM-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
-; GFX11-F32DENORM:       ; %bb.0: ; %entry
-; GFX11-F32DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-F32DENORM-NEXT:    v_mul_f16_e32 v1, v1, v2
-; GFX11-F32DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-F32DENORM-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-F32DENORM-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-F32DENORM-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-F32DENORM-TRUE16-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
+; GFX11-F32DENORM-TRUE16:       ; %bb.0: ; %entry
+; GFX11-F32DENORM-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-F32DENORM-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v1.l, v2.l
+; GFX11-F32DENORM-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-F32DENORM-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-F32DENORM-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-F32DENORM-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-F32DENORM-FAKE16-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
+; GFX11-F32DENORM-FAKE16:       ; %bb.0: ; %entry
+; GFX11-F32DENORM-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-F32DENORM-FAKE16-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-F32DENORM-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-F32DENORM-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-F32DENORM-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-F32DENORM-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -478,14 +605,23 @@ entry:
 ; fold (fsub (fpext (fneg (fmul, x, y))), z)
 ;   -> (fneg (fma (fpext x), (fpext y), z))
 define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
-; GFX11-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e64 v0, v0, -v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fpext_fneg_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -511,14 +647,23 @@ entry:
 ; fold (fsub (fneg (fpext (fmul, x, y))), z)
 ;   -> (fneg (fma (fpext x)), (fpext y), z)
 define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 {
-; GFX11-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e64 v0, v0, -v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e64 v0, v0, -v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_fneg_fpext_fmul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -544,14 +689,23 @@ entry:
 ; fold (fsub (fmad x, y, (fpext (fmul u, v))), z)
 ;    -> (fmad x, y (fmad (fpext u), (fpext v), (fneg z)))
 define float @fsub_muladd_fpext_mul_f16_to_f32(float %x, float %y, float %z, half %u, half %v) #0 {
-; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v3.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
+; GFX11-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
+; GFX11-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -581,16 +735,27 @@ entry:
 ;    -> (fmad (fpext x), (fpext y),
 ;            (fmad (fpext u), (fpext v), (fneg z)))
 define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half %u, half %v) #0 {
-; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f16_e32 v3, v0, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v0.h, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v3, v0, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32:
 ; GFX89:       ; %bb.0: ; %entry
@@ -611,14 +776,23 @@ entry:
 ; fold (fsub x, (fmad y, z, (fpext (fmul u, v))))
 ;   -> (fmad (fneg y), z, (fmad (fneg (fpext u)), (fpext v), x))
 define float @fsub_muladd_fpext_mul_f16_to_f32_commute(float %x, float %y, float %z, half %u, half %v) #0 {
-; GFX11-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v3.l, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
+; GFX11-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
+; GFX11-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-F32FLUSH-LABEL: fsub_muladd_fpext_mul_f16_to_f32_commute:
 ; GFX9-F32FLUSH:       ; %bb.0: ; %entry
@@ -647,16 +821,27 @@ entry:
 ;    -> (fma (fneg (fpext y)), (fpext z),
 ;            (fma (fneg (fpext u)), (fpext v), x))
 define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half %z, half %u, half %v) #0 {
-; GFX11-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f16_e32 v3, v1, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.h, v3.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v1.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_sub_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute:
 ; GFX89:       ; %bb.0: ; %entry
@@ -680,3 +865,7 @@ declare half @llvm.fmuladd.f16(half, half, half) #0
 declare half @llvm.fma.f16(half, half, half) #0
 
 attributes #0 = { nounwind readnone speculatable }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-F32DENORM: {{.*}}
+; GFX11-F32FLUSH-FAKE16: {{.*}}
+; GFX11-F32FLUSH-TRUE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index d855707fbe4de..6d383951be9c1 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -4,7 +4,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define float @v_pow_f32(float %x, float %y) {
 ; GFX6-LABEL: v_pow_f32:
@@ -188,19 +189,33 @@ define half @v_pow_f16(half %x, half %y) {
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_pow_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %pow = call half @llvm.pow.f16(half %x, half %y)
   ret half %pow
 }
@@ -297,31 +312,54 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_pow_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    v_log_f32_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v1, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v1.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
   ret <2 x half> %pow
 }
@@ -422,31 +460,54 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_pow_v2f16_fneg_lhs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v2, -v2
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    v_log_f32_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v1, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, -v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v1.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v2, -v2
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
   ret <2 x half> %pow
@@ -548,31 +609,54 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_pow_v2f16_fneg_rhs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v1, -v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v3, -v3
-; GFX11-NEXT:    v_log_f32_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v1, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_rhs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v3, -v1.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_rhs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v3, -v3
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %y.fneg = fneg <2 x half> %y
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
   ret <2 x half> %pow
@@ -678,31 +762,54 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v1, -v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v2, -v2
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e64 v3, -v3
-; GFX11-NEXT:    v_log_f32_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    v_exp_f32_e32 v1, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs_rhs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, -v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v3, -v1.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs_rhs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v2, -v2
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v3, -v3
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v1, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %y.fneg = fneg <2 x half> %y
   %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg)

diff  --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 0935438f1b951..d957ba93e4fb3 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -13,8 +13,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri < %s | FileCheck -check-prefixes=GCN,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 ; Test patterns to match v_fract_* instructions.
 
@@ -1486,21 +1488,37 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
 ; GFX8-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: basic_fract_f16_nonan:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_fract_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: basic_fract_f16_nonan:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_fract_f16_e32 v0, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: basic_fract_f16_nonan:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: basic_fract_f16_nonan:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: basic_fract_f16_nonan:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: basic_fract_f16_nonan:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -1574,29 +1592,51 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: basic_fract_v2f16_nonan:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_fract_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fract_f16_e32 v1, v1
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: basic_fract_v2f16_nonan:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_fract_f16_e32 v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_fract_f16_e32 v1, v1
-; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: basic_fract_v2f16_nonan:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: basic_fract_v2f16_nonan:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.h, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: basic_fract_v2f16_nonan:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_fract_f16_e32 v1, v1
+; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
   %sub = fsub <2 x half> %x, %floor
@@ -1748,25 +1788,45 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: safe_math_fract_f16_noinf_check:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_floor_f16_e32 v3, v0
-; GFX11-NEXT:    v_fract_f16_e32 v0, v0
-; GFX11-NEXT:    global_store_b16 v[1:2], v3, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: safe_math_fract_f16_noinf_check:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_floor_f16_e32 v3, v0
-; GFX12-NEXT:    v_fract_f16_e32 v0, v0
-; GFX12-NEXT:    global_store_b16 v[1:2], v3, off
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: safe_math_fract_f16_noinf_check:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_d16_hi_b16 v[1:2], v0, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: safe_math_fract_f16_noinf_check:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[1:2], v3, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: safe_math_fract_f16_noinf_check:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v0.h, v0.l
+; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    global_store_d16_hi_b16 v[1:2], v0, off
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: safe_math_fract_f16_noinf_check:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_floor_f16_e32 v3, v0
+; GFX12-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT:    global_store_b16 v[1:2], v3, off
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -2365,31 +2425,57 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: safe_math_fract_f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_fract_f16_e32 v3, v0
-; GFX11-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
-; GFX11-NEXT:    v_floor_f16_e32 v4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
-; GFX11-NEXT:    global_store_b16 v[1:2], v4, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: safe_math_fract_f16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_fract_f16_e32 v3, v0
-; GFX12-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
-; GFX12-NEXT:    v_floor_f16_e32 v4, v0
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
-; GFX12-NEXT:    global_store_b16 v[1:2], v4, off
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: safe_math_fract_f16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_neq_f16_e64 s0, 0x7c00, |v0.l|
+; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0, v0.h, s0
+; GFX11-TRUE16-NEXT:    global_store_b16 v[1:2], v3, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: safe_math_fract_f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
+; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v4, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    global_store_b16 v[1:2], v4, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: safe_math_fract_f16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.h, v0.l
+; GFX12-TRUE16-NEXT:    v_cmp_neq_f16_e64 s0, 0x7c00, |v0.l|
+; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v3.l, v0.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0, v0.h, s0
+; GFX12-TRUE16-NEXT:    global_store_b16 v[1:2], v3, off
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: safe_math_fract_f16:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_fract_f16_e32 v3, v0
+; GFX12-FAKE16-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
+; GFX12-FAKE16-NEXT:    v_floor_f16_e32 v4, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    global_store_b16 v[1:2], v4, off
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call half @llvm.floor.f16(half %x)
   %sub = fsub half %x, %floor
@@ -2538,49 +2624,89 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: safe_math_fract_v2f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_fract_f16_e32 v6, v0
-; GFX11-NEXT:    v_floor_f16_e32 v5, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fract_f16_e32 v4, v3
-; GFX11-NEXT:    v_cmp_class_f16_e64 s0, v3, 0x204
-; GFX11-NEXT:    v_floor_f16_e32 v7, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
-; GFX11-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v4, v5, v7
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
-; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: safe_math_fract_v2f16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    v_fract_f16_e32 v6, v0
-; GFX12-NEXT:    v_floor_f16_e32 v5, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_fract_f16_e32 v4, v3
-; GFX12-NEXT:    v_cmp_class_f16_e64 s0, v3, 0x204
-; GFX12-NEXT:    v_floor_f16_e32 v7, v3
-; GFX12-NEXT:    s_wait_alu 0xf1ff
-; GFX12-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
-; GFX12-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_pack_b32_f16 v4, v5, v7
-; GFX12-NEXT:    s_wait_alu 0xf1ff
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
-; GFX12-NEXT:    global_store_b32 v[1:2], v4, off
-; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: safe_math_fract_v2f16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v3.h, v0.h
+; GFX11-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.h, 0x204
+; GFX11-TRUE16-NEXT:    v_cmp_class_f16_e64 s1, v0.l, 0x204
+; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, 0, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, 0, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: safe_math_fract_v2f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v5, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v4, v3
+; GFX11-FAKE16-NEXT:    v_cmp_class_f16_e64 s0, v3, 0x204
+; GFX11-FAKE16-NEXT:    v_floor_f16_e32 v7, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
+; GFX11-FAKE16-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v4, v5, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX11-FAKE16-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: safe_math_fract_v2f16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v3.l, v0.l
+; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v3.h, v0.h
+; GFX12-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.h, 0x204
+; GFX12-TRUE16-NEXT:    v_cmp_class_f16_e64 s1, v0.l, 0x204
+; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v0.h, v0.h
+; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, 0, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, 0, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v4, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v3.l, v3.h
+; GFX12-TRUE16-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: safe_math_fract_v2f16:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    v_fract_f16_e32 v6, v0
+; GFX12-FAKE16-NEXT:    v_floor_f16_e32 v5, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_fract_f16_e32 v4, v3
+; GFX12-FAKE16-NEXT:    v_cmp_class_f16_e64 s0, v3, 0x204
+; GFX12-FAKE16-NEXT:    v_floor_f16_e32 v7, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
+; GFX12-FAKE16-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v4, v5, v7
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX12-FAKE16-NEXT:    global_store_b32 v[1:2], v4, off
+; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
   %sub = fsub <2 x half> %x, %floor

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 25fe57c16c661..ee62359cffc63 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca,+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s
 
 ; Test that non-entry function frame indices are expanded properly to
 ; give an index relative to the scratch wave offset register
@@ -303,13 +304,20 @@ ret:
 %type.i16 = type { i16 }
 @_ZZN0 = external hidden addrspace(3) global %struct0, align 8
 
-; GFX11-LABEL: tied_operand_test:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
-; GFX11:     v_dual_mov_b32 [[C:v[0-9]+]], 0x7b :: v_dual_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
-; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
-; GFX11-DAG:     ds_store_b16 v{{[0-9]+}}, [[C]]  offset:8
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: tied_operand_test:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16:     scratch_load_d16_b16 [[LDRESULT:v[0-9]+]], off, off
+; GFX11-TRUE16:     v_mov_b16_e32 [[C:v[0-9]]].{{(l|h)}}, 0x7b
+; GFX11-TRUE16-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: tied_operand_test:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16:     scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off
+; GFX11-FAKE16:     v_dual_mov_b32 [[C:v[0-9]+]], 0x7b :: v_dual_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
+; GFX11-FAKE16-DAG:     ds_store_b16 v{{[0-9]+}}, [[LDRESULT]]  offset:10
+; GFX11-FAKE16-DAG:     ds_store_b16 v{{[0-9]+}}, [[C]]  offset:8
+; GFX11-FAKE16-NEXT:    s_endpgm
 define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
 entry:
   %scratch0 = alloca i16, align 4, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index b1732b905e4c1..ff9b0641e43d8 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -14,8 +14,11 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG  %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16: enable the GlobalISel +real-true16 run line below (disabled via XUN) once GlobalISel codegen supports true16.
+; XUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
 
 define void @freeze_v2i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-LABEL: freeze_v2i32:
@@ -5560,13 +5563,29 @@ define void @freeze_i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: freeze_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_i16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %a = load i16, ptr addrspace(1) %ptra
   %freeze = freeze i16 %a
   store i16 %freeze, ptr addrspace(1) %ptrb
@@ -6203,13 +6222,29 @@ define void @freeze_f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: freeze_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_f16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %a = load half, ptr addrspace(1) %ptra
   %freeze = freeze half %a
   store half %freeze, ptr addrspace(1) %ptrb
@@ -6852,13 +6887,29 @@ define void @freeze_bf16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: freeze_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_bf16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_bf16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_bf16:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %a = load bfloat, ptr addrspace(1) %ptra
   %freeze = freeze bfloat %a
   store bfloat %freeze, ptr addrspace(1) %ptrb
@@ -12115,13 +12166,29 @@ define void @freeze_i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-NEXT:    global_store_byte v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: freeze_i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: freeze_i8:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %a = load i8, ptr addrspace(1) %ptra
   %freeze = freeze i8 %a
   store i8 %freeze, ptr addrspace(1) %ptrb
@@ -12235,13 +12302,21 @@ define void @freeze_v2i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: freeze_v2i8:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v2i8:
 ; GFX11-GISEL:       ; %bb.0:
@@ -12423,20 +12498,36 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_byte_d16_hi v[2:3], v0, off offset:2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: freeze_v3i8:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b16 v1, 8, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX11-SDAG-NEXT:    s_clause 0x1
-; GFX11-SDAG-NEXT:    global_store_b8 v[2:3], v0, off offset:2
-; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v1, off
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 8, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_clause 0x1
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v4, off offset:2
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v1, 8, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_clause 0x1
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off offset:2
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v1, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v3i8:
 ; GFX11-GISEL:       ; %bb.0:
@@ -13407,14 +13498,23 @@ define void @freeze_v2i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_byte v[2:3], v0, off
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: freeze_v2i1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX11-SDAG-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_v2i1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 3
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_v2i1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v2i1:
 ; GFX11-GISEL:       ; %bb.0:
@@ -13573,14 +13673,23 @@ define void @freeze_v3i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_byte v[2:3], v0, off
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: freeze_v3i1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 7, v0
-; GFX11-SDAG-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_v3i1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 7
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_v3i1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v3i1:
 ; GFX11-GISEL:       ; %bb.0:
@@ -13863,20 +13972,35 @@ define void @freeze_v2i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_byte v[2:3], v0, off
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: freeze_v2i1_vcc:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 1, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 3, v0
-; GFX11-SDAG-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_v2i1_vcc:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_b64 v[4:5], v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 3
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_v2i1_vcc:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 3, v0
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v2i1_vcc:
 ; GFX11-GISEL:       ; %bb.0:
@@ -14089,24 +14213,43 @@ define void @freeze_v3i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_byte v[2:3], v0, off
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: freeze_v3i1_vcc:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b96 v[4:6], v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 2, v4
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 7, v0
-; GFX11-SDAG-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_v3i1_vcc:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_b96 v[4:6], v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 2, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 7
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_v3i1_vcc:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_b96 v[4:6], v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 2, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v3i1_vcc:
 ; GFX11-GISEL:       ; %bb.0:
@@ -14361,28 +14504,52 @@ define void @freeze_v4i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_byte v[2:3], v0, off
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: freeze_v4i1_vcc:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    global_load_b128 v[4:7], v[0:1], off
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v4, 2, v4
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 3, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX11-SDAG-NEXT:    global_store_b8 v[2:3], v0, off
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: freeze_v4i1_vcc:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 1, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 2, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 3, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, 15
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: freeze_v4i1_vcc:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_b128 v[4:7], v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v4, 2, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 3, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v4i1_vcc:
 ; GFX11-GISEL:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 5febd5256e794..fbe253e95d210 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -4,7 +4,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
 
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
@@ -180,41 +181,79 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: frem_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: frem_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.h, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: frem_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_f16 v1, -v3, v2, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_f16:
 ; GFX1150:       ; %bb.0:
@@ -377,25 +416,45 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fast_frem_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fast_frem_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v1.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fast_frem_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_fma_f16 v1, -v3, v2, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: fast_frem_f16:
 ; GFX1150:       ; %bb.0:
@@ -542,25 +601,45 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: unsafe_frem_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: unsafe_frem_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v1.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: unsafe_frem_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_fma_f16 v1, -v3, v2, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: unsafe_frem_f16:
 ; GFX1150:       ; %bb.0:
@@ -2109,65 +2188,128 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: frem_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
-; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v8, v7
-; GFX11-NEXT:    v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v7
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f16 v1, -v1, v6, v4
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: frem_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_b32 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_b32 v3, v1, s[4:5] offset:16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v0, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v0, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v6.l
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v5, v4.l
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v8, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v5, v8, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v2, -v3, v5, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v2, v2, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v6.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.h, -v0.h, v6.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: frem_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v4
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    v_fma_f16 v3, -v3, v2, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v5, v8, v7
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v1, v1, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_f16 v1, -v1, v6, v4
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v2f16:
 ; GFX1150:       ; %bb.0:
@@ -2683,110 +2825,217 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: frem_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v9, v9
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v6
-; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v5, v7, v6
-; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v5, v6, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v9
-; GFX11-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_fma_f16 v5, -v5, v3, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v10, v9
-; GFX11-NEXT:    v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v9
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
-; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f16 v1, -v1, v8, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; GFX11-NEXT:    v_pack_b32_f16 v1, v5, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v8, v8
-; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_fmac_f32_e32 v3, v6, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_mul_f32_e32 v5, v6, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; GFX11-NEXT:    v_add_f32_e32 v3, v5, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
-; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX11-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v0
-; GFX11-NEXT:    v_fmac_f32_e32 v6, v9, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT:    v_add_f32_e32 v0, v0, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_trunc_f16_e32 v0, v0
-; GFX11-NEXT:    v_fma_f16 v0, -v0, v7, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: frem_v4f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v5, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v4.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v9, v8.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v9, v9
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v6
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v6, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v6.l
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v7, v7, v9
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v10, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v4.l, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v7, v10, v9
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v2, -v4, v7, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v2, v2, v9
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v8.l, v6.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.h, -v0.h, v8.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v8, -v6, v0, v7 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v8, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v6, -v6, v0, v7 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v6, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v8, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v9, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v1, -v3, v6, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v1, v1, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v7.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.h, -v0.h, v7.l, v4.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b64 v5, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: frem_v4f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v9, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v9, v9
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v5, v7, v6
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v7, v7, v9
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    v_fma_f16 v5, -v5, v3, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v7, v10, v9
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_fma_f16 v1, -v1, v8, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v5, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v3, v6, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_fma_f16 v3, -v3, v2, v0
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-FAKE16-NEXT:    v_add_f32_e32 v0, v0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_fma_f16 v0, -v0, v7, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX11-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1150-LABEL: frem_v4f16:
 ; GFX1150:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 7061685729804..0db2a1679197e 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
 ; GFX9-LABEL: void_func_i1_inreg:
@@ -33,12 +34,19 @@ define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i8_inreg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_i8_inreg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_i8_inreg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store i8 %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -52,12 +60,19 @@ define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_i16_inreg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_i16_inreg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_i16_inreg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store i16 %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -110,12 +125,19 @@ define void @void_func_f16_inreg(half inreg %arg0) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_f16_inreg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_f16_inreg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_f16_inreg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store half %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -1292,45 +1314,85 @@ define void @void_func_v32i32_i1_i8_i16_f32_inreg(<32 x i32> inreg %arg0, i1 inr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v21, v1 :: v_dual_mov_b32 v20, v0
-; GFX11-NEXT:    v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29
-; GFX11-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25
-; GFX11-NEXT:    v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27
-; GFX11-NEXT:    v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
-; GFX11-NEXT:    v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23
-; GFX11-NEXT:    v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
-; GFX11-NEXT:    v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
-; GFX11-NEXT:    v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1
-; GFX11-NEXT:    v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v14
-; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b128 v[0:1], v[4:7], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b128 v[0:1], v[8:11], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[0:1], v12, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b8 v[0:1], v15, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b16 v[0:1], v16, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b16 v[0:1], v17, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v21, v1 :: v_dual_mov_b32 v20, v0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s25
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v8, s26 :: v_dual_mov_b32 v9, s27
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v11, s21
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v13, s23
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v22, s16 :: v_dual_mov_b32 v23, s17
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v24, s18 :: v_dual_mov_b32 v25, s19
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[2:5], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[22:25], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 1, v14
+; GFX11-TRUE16-NEXT:    global_store_b128 v[0:1], v[0:3], off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v4, off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v15, off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v16, off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v17, off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v32i32_i1_i8_i16_f32_inreg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v21, v1 :: v_dual_mov_b32 v20, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, s28 :: v_dual_mov_b32 v19, s29
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[10:13], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[6:9], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[2:5], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s27
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v5, s21
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s22 :: v_dual_mov_b32 v7, s23
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s17
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v10, s18 :: v_dual_mov_b32 v11, s19
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v19, s1
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v21, s3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 1, v14
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[0:3], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[4:7], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[8:11], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b128 v[0:1], v[18:21], off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v12, off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v15, off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v16, off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v17, off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) poison
   store volatile i1 %arg1, ptr addrspace(1) poison
   store volatile i8 %arg2, ptr addrspace(1) poison
@@ -1758,12 +1820,19 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_bf16_inreg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_bf16_inreg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_bf16_inreg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store bfloat %arg0, ptr addrspace(1) poison
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index d0a3811314029..81b8b36180746 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,CI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define void @void_func_i1(i1 %arg0) #0 {
 ; CIGFX89-LABEL: void_func_i1:
@@ -943,18 +944,31 @@ define void @void_func_v2i8(<2 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v2i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v2i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v2i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i8> %arg0, ptr addrspace(1) null
   ret void
 }
@@ -1022,20 +1036,35 @@ define void @void_func_v3i8(<3 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v3i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    s_mov_b64 s[0:1], 2
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    buffer_store_b8 v2, off, s[0:3], 0
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v3i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v2, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v3i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v2, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i8> %arg0, ptr addrspace(1) null
   ret void
 }
@@ -1075,25 +1104,45 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i8> %arg0, ptr addrspace(1) null
   ret void
 }
@@ -1137,26 +1186,47 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v5i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    s_mov_b64 s[0:1], 4
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    buffer_store_b8 v4, off, s[0:3], 0
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v5i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v4, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v5i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v4, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i8> %arg0, ptr addrspace(1) null
   ret void
 }
@@ -1210,33 +1280,61 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v0.h, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v2.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i8> %arg0, ptr addrspace(1) null
   ret void
 }
@@ -1318,50 +1416,95 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v10, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v3, v9, v12
-; GFX11-NEXT:    v_or_b32_e32 v2, v8, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v4, v5
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v6
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v13.l, v8.h, v13.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v12.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v8.l, v9.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v5.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v0.h, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v2.l, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v9, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v4, v5
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v9, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v4, v5
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i8> %arg0, ptr addrspace(1) null
   ret void
 }
@@ -1507,92 +1650,180 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_u8 v31, off, s32
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v17, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v16, 0xff, v16
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_lshlrev_b16 v15, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v29, 8, v29
-; GFX11-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; GFX11-NEXT:    v_and_b32_e32 v30, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v25, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; GFX11-NEXT:    v_lshlrev_b16 v27, 8, v27
-; GFX11-NEXT:    v_and_b32_e32 v26, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b16 v21, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v23, 8, v23
-; GFX11-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v19, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v11, v16, v17
-; GFX11-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v15
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v2, v28, v29
-; GFX11-NEXT:    v_or_b32_e32 v3, v24, v25
-; GFX11-NEXT:    v_or_b32_e32 v6, v26, v27
-; GFX11-NEXT:    v_or_b32_e32 v7, v20, v21
-; GFX11-NEXT:    v_or_b32_e32 v10, v22, v23
-; GFX11-NEXT:    v_or_b32_e32 v14, v18, v19
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v18, 0xffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_or_b32_e32 v6, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v10
-; GFX11-NEXT:    v_or_b32_e32 v4, v11, v14
-; GFX11-NEXT:    v_or_b32_e32 v3, v12, v13
-; GFX11-NEXT:    v_or_b32_e32 v2, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v17
-; GFX11-NEXT:    s_mov_b64 s[0:1], 16
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v31
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v1, v30, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v7, v18, v1
-; GFX11-NEXT:    v_or_b32_e32 v1, v15, v16
-; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
-; GFX11-NEXT:    s_mov_b64 s[0:1], 0
-; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v31, off, s32
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v11.h, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v12.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v8.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v9.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v9.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v10.h, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v11.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v12.h, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v13.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v13.h, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v14.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v14.h, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v15.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v6.l, v7.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v12.l, v11.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v16.l, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v17.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v18.l, v5.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v19.l, v7.h, v6.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v8.h, v8.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v10.l, v9.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v6.l, v11.l, v10.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v13.l, v12.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v10.l, v14.l, v13.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v11.l, v15.l, v14.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v13, 0xffff, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v17
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 0xffff, v18
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v17, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v18, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v13, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v4, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v15, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 16
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 8, v31.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v9.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 0xffff, v10
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v9, v10
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v1, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v17, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v12, v19
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v31, off, s32
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v17, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v15, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v29, 8, v29
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v28, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v30, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v25, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v24, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v27, 8, v27
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v21, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v23, 8, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v19, 8, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v16, v17
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v28, v29
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v24, v25
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v26, v27
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v20, v21
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v22, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v18, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v16, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v18, 0xffff, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v11, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 16
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v31
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v30, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v18, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v15, v16
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store <32 x i8> %arg0, ptr addrspace(1) null
   ret void
 }
@@ -1847,18 +2078,32 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v2i24:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v2i24:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v2i24:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %elt0 = extractelement <2 x i24> %arg0, i32 0
   %elt1 = extractelement <2 x i24> %arg0, i32 1
   %add = add i24 %elt0, %elt1
@@ -2436,19 +2681,33 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
 ; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
 ; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_byval_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u8 v1, off, s32
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v1, off, s32
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0
   store { i8, i32 } %arg0.load, ptr addrspace(1) poison
   ret void
@@ -2532,30 +2791,55 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_byval_struct_i8_i32_x2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_u8 v1, off, s32 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:4 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_load_u8 v3, off, s32 offset:8 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v3, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    ds_store_b32 v0, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_byval_struct_i8_i32_x2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v1, off, s32 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v2, off, s32 offset:8 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v2, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_byval_struct_i8_i32_x2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v1, off, s32 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s32 offset:4 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v3, off, s32 offset:8 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v3, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0
   %arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1
   store volatile { i8, i32 } %arg0.load, ptr addrspace(1) poison
@@ -2760,52 +3044,99 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x5
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u16 v36, off, s32 offset:20
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
-; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_store_b16 v36, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v32i32_i1_i8_i16_bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x5
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v36, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v36
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v33, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v32i32_i1_i8_i16_bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x5
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v32, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v33, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v34, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v35, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u16 v36, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 1, v32
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v36, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) poison
   store volatile i1 %arg1, ptr addrspace(1) poison
   store volatile i8 %arg2, ptr addrspace(1) poison
@@ -4143,95 +4474,185 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_v32i32_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x10
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_u8 v33, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_u8 v34, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_u8 v35, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_u8 v36, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_u8 v37, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_u8 v38, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_u8 v39, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_u8 v48, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_u8 v49, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_u8 v50, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_u8 v51, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_u8 v52, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_u8 v53, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_u8 v54, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_u8 v55, off, s32 offset:4
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_v32i32_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x10
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v32, off, s32 offset:64
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v33, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v34, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v35, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v36, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v37, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v38, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v39, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v48, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v49, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v50, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v51, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v52, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v53, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v54, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v55, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_v32i32_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x10
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v32, off, s32 offset:64
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v33, off, s32 offset:60
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v34, off, s32 offset:56
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v35, off, s32 offset:52
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v36, off, s32 offset:48
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v37, off, s32 offset:44
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v38, off, s32 offset:40
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v39, off, s32 offset:36
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v48, off, s32 offset:32
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v49, off, s32 offset:28
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v50, off, s32 offset:24
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v51, off, s32 offset:20
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v52, off, s32 offset:16
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v53, off, s32 offset:12
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v54, off, s32 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v55, off, s32 offset:4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) poison
   store volatile <16 x i8> %arg1, ptr addrspace(1) poison
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 70c23f7b9ea33..08515fcc08e5e 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,CI %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX9 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define i1 @i1_func_void() #0 {
 ; GFX789-LABEL: i1_func_void:
@@ -1176,32 +1177,59 @@ define <16 x i8> @v16i8_func_void() #0 {
 ; GFX789-NEXT:    v_mov_b32_e32 v3, v18
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v16i8_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16
-; GFX11-NEXT:    v_mov_b32_e32 v8, v2
-; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
-; GFX11-NEXT:    v_mov_b32_e32 v2, v17
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v16i8_func_void:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_b128 v[16:19], off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v16i8_func_void:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v17
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) poison
   %val = load <16 x i8>, ptr addrspace(1) %ptr
   ret <16 x i8> %val
@@ -2310,17 +2338,30 @@ define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0)
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_sret_max_known_zero_bits:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 17, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    ds_store_b32 v0, v1
-; GFX11-NEXT:    ds_store_b32 v0, v0
-; GFX11-NEXT:    ds_store_b32 v0, v2
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_sret_max_known_zero_bits:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 17, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v2
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_sret_max_known_zero_bits:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 17, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v1
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v0
+; GFX11-FAKE16-NEXT:    ds_store_b32 v0, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %arg0.int = ptrtoint ptr addrspace(5) %arg0 to i32
 
   %lshr0 = lshr i32 %arg0.int, 16

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index a780c739dce7d..ca9cb456fa19f 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-SCRATCH %s
 
 declare hidden amdgpu_gfx void @external_void_func_i1(i1) #0
@@ -677,34 +678,63 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_i8_signext:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    global_load_i8 v0, v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_signext:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    global_load_d16_i8 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_signext:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    global_load_i8 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -799,34 +829,63 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1038,34 +1097,63 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_i16_signext:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_signext:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_signext:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -1160,34 +1248,63 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3044,38 +3161,71 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v2i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v2i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v2i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3466,40 +3616,76 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v5i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[5:6], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v5
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_mov_b32_e32 v4, v6
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v5i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v5i8@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v5i8@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_b64 v[3:4], v[0:1], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[5:6], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v5i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v5i8@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v5i8@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_b64 v[5:6], v[0:1], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v6
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i8:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3621,42 +3807,80 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v8i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i8@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i8@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v8i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v8i8@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v8i8@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_b64 v[8:9], v[0:1], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v8i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v8i8@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v8i8@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i8:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -3846,70 +4070,136 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v32i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 16
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i8@abs32@hi
-; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i8@abs32@lo
-; GFX11-NEXT:    global_load_b128 v[16:19], v[4:5], off
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
-; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v35
-; GFX11-NEXT:    v_mov_b32_e32 v8, v2
-; GFX11-NEXT:    v_mov_b32_e32 v12, v3
-; GFX11-NEXT:    v_mov_b32_e32 v20, v17
-; GFX11-NEXT:    v_mov_b32_e32 v24, v18
-; GFX11-NEXT:    v_dual_mov_b32 v28, v19 :: v_dual_mov_b32 v19, v34
-; GFX11-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v37
-; GFX11-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v32i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v32i8@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v32i8@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_b128 v[32:35], v[0:1], off
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 16
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    global_load_b128 v[36:39], v[0:1], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v35
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v35
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v35
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 24, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v37
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v37
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v39
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v39
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v39.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v32i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v32i8@abs32@hi
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v32i8@abs32@lo
+; GFX11-FAKE16-NEXT:    global_load_b128 v[16:19], v[4:5], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v17
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v18
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v28, v19 :: v_dual_mov_b32 v19, v34
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v37
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i8:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4062,41 +4352,77 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_i8_ret:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_u8 v0, v[40:41], off
-; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    global_store_b8 v[40:41], v0, off
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_ret:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v[40:41], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    global_store_b8 v[40:41], v0, off
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_i8_ret:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_u8 v0, v[40:41], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    global_store_b8 v[40:41], v0, off
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4224,48 +4550,92 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v2i8_ret:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_u16 v0, v[40:41], off
-; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b16 v[40:41], v0, off
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v2i8_ret:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[40:41], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-TRUE16-NEXT:    global_store_b16 v[40:41], v0, off
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v2i8_ret:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[40:41], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v[40:41], v0, off
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4389,67 +4759,114 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
 ; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    s_mov_b32 s32, s33
-; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
-; GFX10-NEXT:    global_store_byte v[3:4], v2, off
-; GFX10-NEXT:    global_store_short v[40:41], v0, off
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
-; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
-; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
-; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_mov_b32 exec_lo, s35
-; GFX10-NEXT:    s_mov_b32 s33, s34
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_external_void_func_v3i8_ret:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b32 v0, v[40:41], off
-; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 2
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v3
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
-; GFX11-NEXT:    global_store_b16 v[40:41], v3, off
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_mov_b32 s32, s33
+; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
+; GFX10-NEXT:    global_store_byte v[3:4], v2, off
+; GFX10-NEXT:    global_store_short v[40:41], v0, off
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s35
+; GFX10-NEXT:    s_mov_b32 s33, s34
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v3i8_ret:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[40:41], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b8 v[3:4], v2, off
+; GFX11-TRUE16-NEXT:    global_store_b16 v[40:41], v0, off
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v3i8_ret:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret at abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret at abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[40:41], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v4, v3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v2, off
+; GFX11-FAKE16-NEXT:    global_store_b16 v[40:41], v3, off
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4593,57 +5010,109 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v4i8_ret:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b32 v0, v[40:41], off
-; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v[40:41], v0, off
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v4i8_ret:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[40:41], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    global_store_b32 v[40:41], v0, off
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v4i8_ret:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret at abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret at abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[40:41], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    global_store_b32 v[40:41], v0, off
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -4798,62 +5267,120 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v5i8_ret:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[5:6], v[40:41], off
-; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v5
-; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_mov_b32_e32 v4, v6
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
-; GFX11-NEXT:    global_store_b32 v[40:41], v2, off
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v5i8_ret:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_b64 v[3:4], v[40:41], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b64 v[5:6], 24, v[3:4]
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v5.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX11-TRUE16-NEXT:    global_store_b32 v[40:41], v2, off
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v5i8_ret:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret at abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret at abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_b64 v[5:6], v[40:41], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v6
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX11-FAKE16-NEXT:    global_store_b32 v[40:41], v2, off
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5023,69 +5550,137 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v8i8_ret:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_mov_b32_e32 v41, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret at abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    global_load_b64 v[0:1], v[40:41], off
-; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v4, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, v8
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
-; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-NEXT:    global_store_b64 v[40:41], v[0:1], off
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v8i8_ret:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret at abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret at abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    global_load_b64 v[8:9], v[40:41], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v9.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v5.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    global_store_b64 v[40:41], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v8i8_ret:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v41, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[40:41], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v8
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v42, 2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-FAKE16-NEXT:    global_store_b64 v[40:41], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -5395,155 +5990,307 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v32i8_ret:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:12
-; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:8
-; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:4
-; GFX11-NEXT:    scratch_store_b32 off, v43, s33
-; GFX11-NEXT:    v_mov_b32_e32 v40, 0
-; GFX11-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16
-; GFX11-NEXT:    v_mov_b32_e32 v43, 0
-; GFX11-NEXT:    v_writelane_b32 v44, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
-; GFX11-NEXT:    global_load_b128 v[0:3], v[40:41], off
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
-; GFX11-NEXT:    global_load_b128 v[16:19], v[42:43], off
-; GFX11-NEXT:    v_writelane_b32 v44, s30, 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 32
-; GFX11-NEXT:    v_writelane_b32 v44, s31, 1
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
-; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
-; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v35
-; GFX11-NEXT:    v_mov_b32_e32 v8, v2
-; GFX11-NEXT:    v_mov_b32_e32 v12, v3
-; GFX11-NEXT:    v_mov_b32_e32 v20, v17
-; GFX11-NEXT:    v_mov_b32_e32 v24, v18
-; GFX11-NEXT:    v_dual_mov_b32 v28, v19 :: v_dual_mov_b32 v19, v34
-; GFX11-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v37
-; GFX11-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
-; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
-; GFX11-NEXT:    v_or_b32_e32 v9, v10, v11
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
-; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
-; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_or_b32_e32 v13, v14, v13
-; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v9
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v28
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v29
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v31
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v25
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v24
-; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v1
-; GFX11-NEXT:    v_or_b32_e32 v7, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v6, v12, v6
-; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v26
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v27
-; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v21
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v20
-; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v23
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v22
-; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v17
-; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v16
-; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v19
-; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v18
-; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, v13, v12
-; GFX11-NEXT:    v_or_b32_e32 v11, v15, v14
-; GFX11-NEXT:    v_or_b32_e32 v12, v17, v16
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v9
-; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_or_b32_e32 v10, v1, v3
-; GFX11-NEXT:    v_or_b32_e32 v9, v7, v8
-; GFX11-NEXT:    v_or_b32_e32 v8, v13, v14
-; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT:    v_or_b32_e32 v3, v0, v2
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v[42:43], v[7:10], off
-; GFX11-NEXT:    global_store_b128 v[40:41], v[3:6], off
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v43, off, s33
-; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
-; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v32i8_ret:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:12
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v41, s33 offset:8
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v42, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v43, s33
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v43, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s0, 2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
+; GFX11-TRUE16-NEXT:    global_load_b128 v[32:35], v[40:41], off
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
+; GFX11-TRUE16-NEXT:    global_load_b128 v[36:39], v[42:43], off
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v44, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v32
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v33
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v34
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v35
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v35
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v35
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 24, v36
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v37
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v37
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v38
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v39
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v39
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v39
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v32.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v33.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v34.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v35.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v36.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v37.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v38.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v39.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v13.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 8, v9.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v12.l, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v15.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 8, v7.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v9.l, v5.h, v4.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v4.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v5.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v29.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 8, v25.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 8, v31.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v6, v12, v6
+; GFX11-TRUE16-NEXT:    v_or_b16 v8.l, v2.h, v1.h
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 0xffff, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 8, v27.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v21.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v23.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 8, v17.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 8, v19.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v7.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v1.l, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v3.l, v3.h, v3.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v7.l, v7.h, v7.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v10, v8
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v12, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v1, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v3, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v11, v13
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b128 v[42:43], v[7:10], off
+; GFX11-TRUE16-NEXT:    global_store_b128 v[40:41], v[3:6], off
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v43, off, s33
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v42, off, s33 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v41, off, s33 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:12
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v44, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v32i8_ret:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:12
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v41, s33 offset:8
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v42, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v43, s33
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v40, 0
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v43, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v[40:41], off
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
+; GFX11-FAKE16-NEXT:    global_load_b128 v[16:19], v[42:43], off
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s30, 0
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v44, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v35
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v17
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v18
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v28, v19 :: v_dual_mov_b32 v19, v34
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v37
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v9, 8, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v11, 8, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v7, 8, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v10, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v6, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v13, 8, v15
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v14, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v28
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v29
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v31
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v30
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v25
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v7, v1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v6, v12, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v26
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff, v8
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v27
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v10, 8, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v20
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v12, 8, v23
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v22
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v14, 8, v17
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v16
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v16, 8, v19
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 0xff, v18
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v11, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v13, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v15, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v17, v16
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v13, 0xffff, v9
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v1, v3
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v7, v8
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v13, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v0, v2
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b128 v[42:43], v[7:10], off
+; GFX11-FAKE16-NEXT:    global_store_b128 v[40:41], v[3:6], off
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v43, off, s33
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v42, off, s33 offset:4
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v41, off, s33 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:12
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v44, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v44, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v44, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i8_ret:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8474,38 +9221,71 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u8 v0, v1, s[0:1]
-; GFX11-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_u8 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8611,37 +9391,70 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_byval_struct_i8_i32@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_byval_struct_i8_i32@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b8 off, v0, s33
-; GFX11-NEXT:    scratch_store_b32 off, v1, s33 offset:4
-; GFX11-NEXT:    v_mov_b32_e32 v0, s33
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_byval_struct_i8_i32@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_byval_struct_i8_i32@abs32@lo
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b8 off, v0, s33
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s33
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_byval_struct_i8_i32@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_byval_struct_i8_i32@abs32@lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b8 off, v0, s33
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s33
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_byval_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8768,45 +9581,86 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
-; GFX11-NEXT:    s_add_i32 s32, s32, 32
-; GFX11-NEXT:    s_add_i32 s2, s33, 8
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b8 off, v0, s33
-; GFX11-NEXT:    scratch_store_b32 off, v1, s33 offset:4
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s33
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_u8 v0, off, s33 offset:8
-; GFX11-NEXT:    scratch_load_b32 v1, off, s33 offset:12
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 3
+; GFX11-TRUE16-NEXT:    s_add_i32 s2, s33, 8
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 8
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b8 off, v0, s33
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@lo
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s33 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    global_store_b32 v[0:1], v1, off dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-FAKE16-NEXT:    s_add_i32 s2, s33, 8
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b8 off, v0, s33
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v1, s33 offset:4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@lo
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_load_u8 v0, off, s33 offset:8
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s33 offset:12
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    global_store_b32 v[0:1], v1, off dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -8969,52 +9823,99 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_v16i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[0:1]
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16i8@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16i8@abs32@lo
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GFX11-NEXT:    v_mov_b32_e32 v4, v1
-; GFX11-NEXT:    v_mov_b32_e32 v8, v2
-; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
-; GFX11-NEXT:    v_dual_mov_b32 v1, v16 :: v_dual_mov_b32 v2, v17
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_v16i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[16:19], v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_v16i8@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_v16i8@abs32@lo
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v17
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v18
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v16.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v17.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v18.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v19.l
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_v16i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_v16i8@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_v16i8@abs32@lo
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v1, v16 :: v_dual_mov_b32 v2, v17
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i8:
 ; GFX10-SCRATCH:       ; %bb.0:
@@ -17319,32 +18220,61 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s0, s33
-; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16@abs32@hi
-; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16@abs32@lo
-; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
-; GFX11-NEXT:    s_mov_b32 s32, s33
-; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
-; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_mov_b32 s33, s0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s0, 3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, external_void_func_bf16@abs32@hi
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, external_void_func_bf16@abs32@lo
+; GFX11-TRUE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s4, 0
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s30, 1
+; GFX11-TRUE16-NEXT:    v_writelane_b32 v40, s31, 2
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s31, v40, 2
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s30, v40, 1
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s4, v40, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-TRUE16-NEXT:    v_readlane_b32 s0, v40, 3
+; GFX11-TRUE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s33
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s32
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, external_void_func_bf16@abs32@hi
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, external_void_func_bf16@abs32@lo
+; GFX11-FAKE16-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-FAKE16-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, s33
+; GFX11-FAKE16-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX11-FAKE16-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-FAKE16-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s33, s0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg:
 ; GFX10-SCRATCH:       ; %bb.0:

diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index 3685ac18ab4d4..891e9b8673d91 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -1,9 +1,13 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND,WORKAROUND-TRUE16-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND,WORKAROUND-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,WORKAROUND,WORKAROUND-FAKE16 %s
 
 ; Does not apply to wave64
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
 
 ; Does not apply to gfx1101
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1101 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,NOWORKAROUND %s
@@ -83,7 +87,8 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 {
 }
 
 ; GCN-LABEL: {{^}}queue_ptr:
-; GCN: global_load_u8 v{{[0-9]+}},
+; WORKAROUND-TRUE16-SDAG: global_load_d16_u8
+; WORKAROUND-FAKE16: global_load_u8 v{{[0-9]+}},
 
 ; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15
 ; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s4
@@ -124,9 +129,13 @@ define amdgpu_kernel void @queue_ptr() #1 {
 ; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9
 ; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10
 
-; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
-; GCN: global_load_u8 v{{[0-9]+}},
-; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
+; WORKAROUND-TRUE16-SDAG: global_load_d16_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
+; WORKAROUND-TRUE16-SDAG: global_load_d16_u8 v{{[0-9]+}},
+; WORKAROUND-TRUE16-SDAG: global_load_d16_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
+
+; WORKAROUND-FAKE16: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1]
+; WORKAROUND-FAKE16: global_load_u8 v{{[0-9]+}},
+; WORKAROUND-FAKE16: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5]
 
 ; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6
 ; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7

diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index a33aeac255372..28245c538a04c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -1,9 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 ; Test using saddr addressing mode of global_*load_* flat instructions.
 
@@ -2284,17 +2287,35 @@ define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: global_load_saddr_i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: global_load_saddr_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: global_load_saddr_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -2309,17 +2330,35 @@ define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %s
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_i16_immneg128:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: global_load_saddr_i16_immneg128:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: global_load_saddr_i16_immneg128:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: global_load_saddr_i16_immneg128:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16_immneg128:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_i16_immneg128:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_i16_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -2335,17 +2374,35 @@ define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: global_load_saddr_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: global_load_saddr_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: global_load_saddr_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_f16:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_f16:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_f16:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load half, ptr addrspace(1) %gep0
@@ -2359,17 +2416,35 @@ define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %s
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: global_load_saddr_f16_immneg128:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: global_load_saddr_f16_immneg128:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: global_load_saddr_f16_immneg128:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: global_load_saddr_f16_immneg128:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_f16_immneg128:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_f16_immneg128:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-LABEL: global_load_saddr_f16_immneg128:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -3841,11 +3916,17 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_undef_hi:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_undef_hi:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -3873,11 +3954,17 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -3912,12 +3999,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_zero_hi:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -3951,12 +4045,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4216,12 +4317,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_undef_hi:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -4249,12 +4357,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
@@ -4289,12 +4404,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3]
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3]
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3]
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %load = load i16, ptr addrspace(1) %gep0
@@ -4328,12 +4450,19 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX12-SDAG-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
-; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:-128
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
   %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
   %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128

diff  --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 10573aad38a51..f767511370eee 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,CI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 ; half args should be promoted to float for CI and lower.
 
@@ -961,15 +962,25 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
 ; CIVI-NEXT:    flat_store_short v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_load_store_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_load_store_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_load_store_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   store half %val, ptr addrspace(1) %out
   ret void
@@ -1086,16 +1097,27 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_f16_to_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_f16_to_f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_f16_to_f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   %cvt = fpext half %val to float
   store float %cvt, ptr addrspace(1) %out
@@ -1140,19 +1162,33 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v2f16_to_v2f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v2, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v2f16_to_v2f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.l
+; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v2f16_to_v2f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v2, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   %cvt = fpext <2 x half> %val to <2 x float>
   store <2 x float> %cvt, ptr addrspace(1) %out
@@ -1199,20 +1235,35 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v3f16_to_v3f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v3, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v4
-; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v3f16_to_v3f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v3, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v4.l
+; GFX11-TRUE16-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v3f16_to_v3f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v3, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v4
+; GFX11-FAKE16-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <3 x half>, ptr addrspace(1) %in
   %cvt = fpext <3 x half> %val to <3 x float>
   store <3 x float> %cvt, ptr addrspace(1) %out
@@ -1262,22 +1313,39 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v4f16_to_v4f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v5
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v4f16_to_v4f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v5.l
+; GFX11-TRUE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v4f16_to_v4f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v5
+; GFX11-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <4 x half>, ptr addrspace(1) %in
   %cvt = fpext <4 x half> %val to <4 x float>
   store <4 x float> %cvt, ptr addrspace(1) %out
@@ -1347,29 +1415,53 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v8f16_to_v8f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v12, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v12, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, v5
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v12, v[8:11], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v12, v[4:7], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v8f16_to_v8f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v12, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v0.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v10, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v8, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v11, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v9, v9.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v5, v0.l
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b128 v12, v[8:11], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v12, v[4:7], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v8f16_to_v8f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v12, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v10, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v11, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b128 v12, v[8:11], s[0:1] offset:16
+; GFX11-FAKE16-NEXT:    global_store_b128 v12, v[4:7], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <8 x half>, ptr addrspace(1) %in
   %cvt = fpext <8 x half> %val to <8 x float>
   store <8 x float> %cvt, ptr addrspace(1) %out
@@ -1491,46 +1583,87 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v16f16_to_v16f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v20, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v20, s[2:3]
-; GFX11-NEXT:    global_load_b128 v[4:7], v20, s[2:3] offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v18, v7
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v14, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v12, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v5
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v19, v7
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v17, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v15, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v5
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_store_b128 v20, v[16:19], s[0:1] offset:48
-; GFX11-NEXT:    global_store_b128 v20, v[0:3], s[0:1] offset:32
-; GFX11-NEXT:    global_store_b128 v20, v[12:15], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v20, v[8:11], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v16f16_to_v16f32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v20, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v20, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v20, s[2:3] offset:16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v10, v1.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v18, v7.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v16, v6.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v8, v0.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v14, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v12, v2.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v4.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v19, v7.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v17, v6.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v11, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v15, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v4.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v13, v13.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v9, v9.l
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    global_store_b128 v20, v[16:19], s[0:1] offset:48
+; GFX11-TRUE16-NEXT:    global_store_b128 v20, v[0:3], s[0:1] offset:32
+; GFX11-TRUE16-NEXT:    global_store_b128 v20, v[12:15], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v20, v[8:11], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v16f16_to_v16f32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v20, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v20, s[2:3] offset:16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v10, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v18, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v16, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v14, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v12, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v19, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v17, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v11, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v15, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v4
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    global_store_b128 v20, v[16:19], s[0:1] offset:48
+; GFX11-FAKE16-NEXT:    global_store_b128 v20, v[0:3], s[0:1] offset:32
+; GFX11-FAKE16-NEXT:    global_store_b128 v20, v[12:15], s[0:1] offset:16
+; GFX11-FAKE16-NEXT:    global_store_b128 v20, v[8:11], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <16 x half>, ptr addrspace(1) %in
   %cvt = fpext <16 x half> %val to <16 x float>
   store <16 x float> %cvt, ptr addrspace(1) %out
@@ -1556,18 +1689,31 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
 ; CIVI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CIVI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_f16_to_f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_f16_to_f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_f16_to_f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v2, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   %cvt = fpext half %val to double
   store double %cvt, ptr addrspace(1) %out
@@ -1616,22 +1762,39 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v2f16_to_v2f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v4, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v2f16_to_v2f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v4, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-TRUE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v2f16_to_v2f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v4, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) %in
   %cvt = fpext <2 x half> %val to <2 x double>
   store <2 x double> %cvt, ptr addrspace(1) %out
@@ -1694,26 +1857,47 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v3f16_to_v3f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v6, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v3f16_to_v3f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v6, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v3f16_to_v3f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v6, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX11-FAKE16-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <3 x half>, ptr addrspace(1) %in
   %cvt = fpext <3 x half> %val to <3 x double>
   store <3 x double> %cvt, ptr addrspace(1) %out
@@ -1781,30 +1965,55 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v4f16_to_v4f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v4f16_to_v4f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v8, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v4f16_to_v4f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v8, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-FAKE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <4 x half>, ptr addrspace(1) %in
   %cvt = fpext <4 x half> %val to <4 x double>
   store <4 x double> %cvt, ptr addrspace(1) %out
@@ -1910,39 +2119,73 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v8f16_to_v8f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v16, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v17, v5
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v9
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v3
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v6
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v8f16_to_v8f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v16, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v0.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v1.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v8, v2.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v17, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v9.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v7.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[12:13], v3
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[14:15], v6
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
+; GFX11-TRUE16-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
+; GFX11-TRUE16-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v8f16_to_v8f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v17, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v9
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[12:13], v3
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[14:15], v6
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
+; GFX11-FAKE16-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
+; GFX11-FAKE16-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
+; GFX11-FAKE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <8 x half>, ptr addrspace(1) %in
   %cvt = fpext <8 x half> %val to <8 x double>
   store <8 x double> %cvt, ptr addrspace(1) %out
@@ -2137,66 +2380,127 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; VI-NEXT:    flat_store_dwordx4 v[13:14], v[5:8]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_extload_v16f16_to_v16f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v32, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v32, s[2:3]
-; GFX11-NEXT:    global_load_b128 v[4:7], v32, s[2:3] offset:16
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v15, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v14, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v13, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v12, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v18, v4
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v22, v5
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v23
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v34, v11
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[28:29], v22
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[30:31], v10
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[24:25], v18
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[26:27], v11
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v33, v9
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[20:21], v15
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[22:23], v7
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[16:17], v14
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[18:19], v6
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v13
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v3
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v34
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v33
-; GFX11-NEXT:    s_clause 0x7
-; GFX11-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:80
-; GFX11-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:64
-; GFX11-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:112
-; GFX11-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:96
-; GFX11-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX11-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX11-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_extload_v16f16_to_v16f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v32, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v32, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v32, s[2:3] offset:16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v10, v1.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v15, v7.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v14, v6.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v13, v3.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v12, v2.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v18, v4.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v22, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v10, v23.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v34, v11.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v11, v19.l
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v7.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v6.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v8, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[28:29], v22
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[30:31], v10
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[24:25], v18
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[26:27], v11
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v33, v9.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[20:21], v15
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[22:23], v7
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[16:17], v14
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[18:19], v6
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[12:13], v13
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[14:15], v3
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[6:7], v34
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v33
+; GFX11-TRUE16-NEXT:    s_clause 0x7
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:80
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:64
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:112
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:48
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_extload_v16f16_to_v16f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v32, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v32, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v32, s[2:3] offset:16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v10, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v15, v7
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v14, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v13, v3
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v12, v2
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v18, v4
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v22, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v10, v23
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v34, v11
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v11, v19
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[28:29], v22
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[30:31], v10
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[24:25], v18
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[26:27], v11
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v33, v9
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[20:21], v15
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[22:23], v7
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[16:17], v14
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[18:19], v6
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[12:13], v13
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[14:15], v3
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[6:7], v34
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v33
+; GFX11-FAKE16-NEXT:    s_clause 0x7
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:80
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:64
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:112
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:48
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
+; GFX11-FAKE16-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <16 x half>, ptr addrspace(1) %in
   %cvt = fpext <16 x half> %val to <16 x double>
   store <16 x double> %cvt, ptr addrspace(1) %out
@@ -2221,16 +2525,27 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
 ; CIVI-NEXT:    flat_store_short v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_truncstore_f32_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_truncstore_f32_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_truncstore_f32_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %in
   %cvt = fptrunc float %val to half
   store half %cvt, ptr addrspace(1) %out
@@ -2277,19 +2592,33 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_truncstore_v2f32_to_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_truncstore_v2f32_to_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v3, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_truncstore_v2f32_to_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <2 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <2 x float> %val to <2 x half>
   store <2 x half> %cvt, ptr addrspace(1) %out
@@ -2348,22 +2677,39 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_truncstore_v3f32_to_v3f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b96 v[0:2], v3, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1] offset:4
-; GFX11-NEXT:    global_store_b32 v3, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_truncstore_v3f32_to_v3f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b96 v[0:2], v3, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_d16_hi_b16 v3, v0, s[0:1] offset:4
+; GFX11-TRUE16-NEXT:    global_store_b32 v3, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_truncstore_v3f32_to_v3f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b96 v[0:2], v3, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b16 v3, v2, s[0:1] offset:4
+; GFX11-FAKE16-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <3 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <3 x float> %val to <3 x half>
   store <3 x half> %cvt, ptr addrspace(1) %out
@@ -2417,22 +2763,39 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_truncstore_v4f32_to_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v2, v3
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_truncstore_v4f32_to_v4f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    global_store_b64 v4, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_truncstore_v4f32_to_v4f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v2, v3
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v5
+; GFX11-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <4 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <4 x float> %val to <4 x half>
   store <4 x half> %cvt, ptr addrspace(1) %out
@@ -2512,31 +2875,57 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_truncstore_v8f32_to_v8f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v8, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
-; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX11-NEXT:    v_pack_b32_f16 v3, v2, v3
-; GFX11-NEXT:    v_pack_b32_f16 v2, v0, v1
-; GFX11-NEXT:    v_pack_b32_f16 v1, v6, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v4, v5
-; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_truncstore_v8f32_to_v8f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v7
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v6
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.l, v5
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.l, v4
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v4.l, v5.l
+; GFX11-TRUE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_truncstore_v8f32_to_v8f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v3, v2, v3
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v2, v0, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v6, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v4, v5
+; GFX11-FAKE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <8 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <8 x float> %val to <8 x half>
   store <8 x half> %cvt, ptr addrspace(1) %out
@@ -2678,48 +3067,91 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_truncstore_v16f32_to_v16f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v16, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[2:3] offset:16
-; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[2:3]
-; GFX11-NEXT:    global_load_b128 v[8:11], v16, s[2:3] offset:48
-; GFX11-NEXT:    global_load_b128 v[12:15], v16, s[2:3] offset:32
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v17, v5
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v18, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v11
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v10
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v10, v15
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v11, v14
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; GFX11-NEXT:    v_pack_b32_f16 v3, v2, v3
-; GFX11-NEXT:    v_pack_b32_f16 v2, v0, v1
-; GFX11-NEXT:    v_pack_b32_f16 v1, v6, v7
-; GFX11-NEXT:    v_pack_b32_f16 v7, v5, v4
-; GFX11-NEXT:    v_pack_b32_f16 v6, v8, v9
-; GFX11-NEXT:    v_pack_b32_f16 v5, v11, v10
-; GFX11-NEXT:    v_pack_b32_f16 v4, v12, v13
-; GFX11-NEXT:    v_pack_b32_f16 v0, v18, v17
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: global_truncstore_v16f32_to_v16f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v17, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x3
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v17, s[2:3] offset:16
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v17, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_b128 v[8:11], v17, s[2:3] offset:48
+; GFX11-TRUE16-NEXT:    global_load_b128 v[12:15], v17, s[2:3] offset:32
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v16.l, v5
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v16.h, v4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.l, v11
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.h, v10
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.l, v9
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.h, v8
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v8.l, v15
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v8.h, v14
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v9.l, v13
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v9.h, v12
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v7
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v6
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v7, v4.h, v4.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v6, v5.h, v5.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v5, v8.h, v8.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v9.h, v9.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v0.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v16.h, v16.l
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_store_b128 v17, v[4:7], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v17, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: global_truncstore_v16f32_to_v16f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x3
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v16, s[2:3] offset:16
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v16, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_b128 v[8:11], v16, s[2:3] offset:48
+; GFX11-FAKE16-NEXT:    global_load_b128 v[12:15], v16, s[2:3] offset:32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v17, v5
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v18, v4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v11
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v10
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v10, v15
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v11, v14
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v3, v2, v3
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v2, v0, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v6, v7
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v7, v5, v4
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v6, v8, v9
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v5, v11, v10
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v4, v12, v13
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v18, v17
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
+; GFX11-FAKE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load <16 x float>, ptr addrspace(1) %in
   %cvt = fptrunc <16 x float> %val to <16 x half>
   store <16 x half> %cvt, ptr addrspace(1) %out
@@ -2763,18 +3195,31 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fadd_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_add_f16_e64 v1, s2, s3
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fadd_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, s2, s3
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fadd_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_add_f16_e64 v1, s2, s3
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
    %add = fadd half %a, %b
    store half %add, ptr addrspace(1) %out, align 4
    ret void
@@ -3062,15 +3507,25 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
 ; CIVI-NEXT:    flat_store_short v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_bitcast_from_half:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_bitcast_from_half:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bitcast_from_half:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[2:3]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load half, ptr addrspace(1) %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, ptr addrspace(1) %out
@@ -3094,15 +3549,25 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
 ; CIVI-NEXT:    flat_store_short v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_bitcast_to_half:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_bitcast_to_half:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_bitcast_to_half:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 6c472123ee766..d28f0a190e117 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -5,7 +5,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-DL,GFX11-DL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-DL,GFX11-DL-FAKE16 %s
 
 define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32:
@@ -1155,52 +1156,96 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
-; GFX11-DL-LABEL: idot4_acc16_vecMul:
-; GFX11-DL:       ; %bb.0: ; %entry
-; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT:    s_clause 0x1
-; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 8, v1
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 8, v0
-; GFX11-DL-NEXT:    v_bfe_i32 v6, v0, 0, 8
-; GFX11-DL-NEXT:    v_bfe_i32 v7, v1, 0, 8
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
-; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 8, v1
-; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 8, v0
-; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
-; GFX11-DL-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
-; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
-; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
-; GFX11-DL-NEXT:    s_endpgm
+; GFX11-DL-TRUE16-LABEL: idot4_acc16_vecMul:
+; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v3, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v6.h, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v8.h, 8, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.h
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v1.h, 8, v1.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v9, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v6, v6, v8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v6.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v2, v1
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v6.h
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v3, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DL-FAKE16-LABEL: idot4_acc16_vecMul:
+; GFX11-DL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-DL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-FAKE16-NEXT:    global_load_u16 v3, v2, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-FAKE16-NEXT:    v_ashrrev_i16 v4, 8, v1
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-FAKE16-NEXT:    v_ashrrev_i16 v5, 8, v0
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v6, v0, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v7, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-FAKE16-NEXT:    v_ashrrev_i16 v6, 8, v1
+; GFX11-DL-FAKE16-NEXT:    v_ashrrev_i16 v7, 8, v0
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-FAKE16-NEXT:    global_store_b16 v2, v0, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -3378,42 +3423,87 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_store_dword v1, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
-; GFX11-DL-LABEL: idot4_nonstandard_signed:
-; GFX11-DL:       ; %bb.0: ; %entry
-; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT:    s_clause 0x1
-; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-DL-NEXT:    v_mul_lo_u16 v2, v2, v3
-; GFX11-DL-NEXT:    v_bfe_i32 v3, v4, 0, 8
-; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v5
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX11-DL-NEXT:    v_bfe_i32 v5, v6, 0, 8
-; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX11-DL-NEXT:    v_mad_u16 v2, v4, v3, v2
-; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_mad_u16 v2, v6, v5, v2
-; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v2
-; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[4:5]
-; GFX11-DL-NEXT:    s_endpgm
+; GFX11-DL-TRUE16-LABEL: idot4_nonstandard_signed:
+; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v0, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v3.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v5, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.h, v2.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v3.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-DL-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DL-FAKE16-LABEL: idot4_nonstandard_signed:
+; GFX11-DL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-DL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v3
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v7
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v2, v4, v3, v2
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v2, v6, v5, v2
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v0, v1, v0, v2
+; GFX11-DL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-DL-FAKE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 75e72a72bebb1..82d62910bcb00 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -5,7 +5,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-DL,GFX11-DL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-DL,GFX11-DL-FAKE16 %s
 
 define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32:
@@ -1665,38 +1666,77 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
-; GFX11-DL-LABEL: notdot4_mixedtypes:
-; GFX11-DL:       ; %bb.0: ; %entry
-; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT:    s_clause 0x1
-; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT:    v_bfe_i32 v6, v1, 0, 8
-; GFX11-DL-NEXT:    v_bfe_i32 v7, v0, 0, 8
-; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
-; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0302
-; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
-; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
-; GFX11-DL-NEXT:    s_endpgm
+; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes:
+; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v2.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v1, v4, v4, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v2, v3, v3, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_dot4_u32_u8 v0, v2, v1, v0
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v5, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes:
+; GFX11-DL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-DL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v7, v0, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-DL-FAKE16-NEXT:    global_load_u16 v3, v2, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0xc0c0302
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v1, v1, v1, 0xc0c0302
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_dot4_u32_u8 v0, v1, v0, v3
+; GFX11-DL-FAKE16-NEXT:    global_store_b16 v2, v0, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1921,43 +1961,86 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
-; GFX11-DL-LABEL: notdot4_mixedtypes2:
-; GFX11-DL:       ; %bb.0: ; %entry
-; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT:    s_clause 0x1
-; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_and_b32_e32 v9, 0xff, v0
-; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-DL-NEXT:    v_bfe_i32 v8, v1, 0, 8
-; GFX11-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX11-DL-NEXT:    v_bfe_i32 v4, v6, 0, 8
-; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v7
-; GFX11-DL-NEXT:    v_mad_u16 v3, v8, v9, v3
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
-; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
-; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
-; GFX11-DL-NEXT:    s_endpgm
+; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes2:
+; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v4, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v3.h
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v1.h, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v5, 0, 8
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v0.h, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v4, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes2:
+; GFX11-DL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-DL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v0
+; GFX11-DL-FAKE16-NEXT:    global_load_u16 v3, v2, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-FAKE16-NEXT:    v_bfe_i32 v4, v6, 0, 8
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v3, v8, v9, v3
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-FAKE16-NEXT:    global_store_b16 v2, v0, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2358,50 +2441,87 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
-; GFX11-DL-LABEL: udot4_acc16_vecMul:
-; GFX11-DL:       ; %bb.0: ; %entry
-; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT:    s_clause 0x1
-; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
-; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[4:5]
-; GFX11-DL-NEXT:    v_lshrrev_b16 v4, 8, v1
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT:    v_lshrrev_b16 v5, 8, v0
-; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v0
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
-; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
-; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v9
-; GFX11-DL-NEXT:    v_and_b32_e32 v7, 0xff, v8
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
-; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[4:5]
-; GFX11-DL-NEXT:    s_endpgm
+; GFX11-DL-TRUE16-LABEL: udot4_acc16_vecMul:
+; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v3, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v4.h, 8, v1.l
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v5.h, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.h
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v7.l
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v4.h
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v3, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DL-FAKE16-LABEL: udot4_acc16_vecMul:
+; GFX11-DL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-DL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
+; GFX11-DL-FAKE16-NEXT:    global_load_u16 v3, v2, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b16 v4, 8, v1
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b16 v5, 8, v0
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v9
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-FAKE16-NEXT:    global_store_b16 v2, v0, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2596,52 +2716,95 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[6:7]
 ; GFX10-DL-NEXT:    s_endpgm
 ;
-; GFX11-DL-LABEL: udot4_acc8_vecMul:
-; GFX11-DL:       ; %bb.0: ; %entry
-; GFX11-DL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT:    s_clause 0x1
-; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[0:1]
-; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[4:5]
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-DL-NEXT:    v_lshrrev_b16 v8, 8, v1
-; GFX11-DL-NEXT:    v_lshrrev_b16 v9, 8, v0
-; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
-; GFX11-DL-NEXT:    v_mul_lo_u16 v5, v5, v6
-; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v4, v7
-; GFX11-DL-NEXT:    v_mul_lo_u16 v8, v8, v9
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
-; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-NEXT:    v_or_b32_e32 v6, v6, v5
-; GFX11-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-DL-NEXT:    v_or_b32_e32 v6, v8, v6
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
-; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v6
-; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v7, v0
-; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[4:5]
-; GFX11-DL-NEXT:    s_endpgm
+; GFX11-DL-TRUE16-LABEL: udot4_acc8_vecMul:
+; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_u8 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v4
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v3.l
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v3.h, v4.h
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b16 v1.h, 8, v4.l
+; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.l, v4.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v2.l, v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v0.h, v1.h
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v0.h
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-DL-TRUE16-NEXT:    v_or_b32_e32 v1, v2, v1
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.h, v4.h, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    global_store_b8 v5, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-DL-FAKE16-LABEL: udot4_acc8_vecMul:
+; GFX11-DL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-DL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-DL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    s_clause 0x1
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-FAKE16-NEXT:    global_load_b32 v0, v0, s[2:3]
+; GFX11-DL-FAKE16-NEXT:    global_load_u8 v3, v2, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b16 v8, 8, v1
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b16 v9, 8, v0
+; GFX11-DL-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-FAKE16-NEXT:    v_mul_lo_u16 v5, v5, v6
+; GFX11-DL-FAKE16-NEXT:    v_mul_lo_u16 v6, v4, v7
+; GFX11-DL-FAKE16-NEXT:    v_mul_lo_u16 v8, v8, v9
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v5
+; GFX11-DL-FAKE16-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-DL-FAKE16-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v6
+; GFX11-DL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-FAKE16-NEXT:    v_mad_u16 v0, v4, v7, v0
+; GFX11-DL-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-FAKE16-NEXT:    global_store_b8 v2, v0, s[4:5]
+; GFX11-DL-FAKE16-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 44b1bb25bc057..7c03fe9af5a13 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -50,160 +51,313 @@ bb:
 
 ; FIXME: This generates "instid1(/* invalid instid value */)".
 define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
-; GFX11-LABEL: f2:
-; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_mov_b64 s[16:17], s[4:5]
-; GFX11-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-NEXT:    s_load_b32 s19, s[16:17], 0x24
-; GFX11-NEXT:    s_mov_b32 s12, s13
-; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s20, 0
-; GFX11-NEXT:    s_mov_b32 s0, -1
-; GFX11-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u32 v0, s19, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_13
-; GFX11-NEXT:  ; %bb.1: ; %bb14
-; GFX11-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
-; GFX11-NEXT:    s_mov_b32 s18, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_bitcmp1_b32 s21, 0
-; GFX11-NEXT:    s_cselect_b32 s24, -1, 0
-; GFX11-NEXT:    s_bitcmp0_b32 s21, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB2_3
-; GFX11-NEXT:  ; %bb.2: ; %bb15
-; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s13, s14
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s21, s14
-; GFX11-NEXT:    s_mov_b32 s14, s15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s14, s21
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_cbranch_execz .LBB2_4
-; GFX11-NEXT:    s_branch .LBB2_12
-; GFX11-NEXT:  .LBB2_3:
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB2_12
-; GFX11-NEXT:  .LBB2_4: ; %bb16
-; GFX11-NEXT:    s_load_b32 s0, s[16:17], 0x54
-; GFX11-NEXT:    s_bitcmp1_b32 s23, 0
-; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX11-NEXT:    s_and_b32 s1, s23, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, -1
-; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
-; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
-; GFX11-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_hi_u32 s0, s29, s28
-; GFX11-NEXT:    s_mul_i32 s1, s29, s28
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_or_b32 s0, s0, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s30
-; GFX11-NEXT:    s_mul_i32 s0, s0, s22
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_i32 s0, s0, s20
-; GFX11-NEXT:    s_or_b32 s0, s19, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
-; GFX11-NEXT:    s_mov_b32 s0, s1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[20:21]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB2_6: ; %bb18
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT:    s_and_b32 s1, s8, s1
-; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-NEXT:    s_cselect_b32 s1, s19, s13
-; GFX11-NEXT:    s_and_b32 s13, 0xffff, s0
-; GFX11-NEXT:    s_and_b32 s1, s1, 1
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX11-NEXT:    s_and_b32 s20, s9, exec_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
-; GFX11-NEXT:    v_readfirstlane_b32 s13, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-NEXT:    s_cselect_b32 s13, s19, s13
-; GFX11-NEXT:    s_bitcmp1_b32 s13, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 0x100, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s13, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_6
-; GFX11-NEXT:  ; %bb.7: ; %Flow
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB2_8: ; %Flow12
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_12
-; GFX11-NEXT:  ; %bb.9:
-; GFX11-NEXT:    s_xor_b32 s0, s8, -1
-; GFX11-NEXT:  .LBB2_10: ; %bb17
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_10
-; GFX11-NEXT:  ; %bb.11: ; %Flow6
-; GFX11-NEXT:    s_mov_b32 s18, -1
-; GFX11-NEXT:  .LBB2_12: ; %Flow11
-; GFX11-NEXT:    s_and_b32 s20, s2, exec_lo
-; GFX11-NEXT:    s_or_not1_b32 s0, s18, exec_lo
-; GFX11-NEXT:  .LBB2_13: ; %Flow9
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-NEXT:    s_and_saveexec_b32 s3, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_15
-; GFX11-NEXT:  ; %bb.14: ; %bb43
-; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s13, s14
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s14, s15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_or_b32 s20, s20, exec_lo
-; GFX11-NEXT:  .LBB2_15: ; %Flow14
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s20
-; GFX11-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
-; GFX11-NEXT:    ; divergent unreachable
-; GFX11-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: f2:
+; GFX11-TRUE16:       ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[16:17], s[4:5]
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-TRUE16-NEXT:    s_load_b32 s19, s[16:17], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s13
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s20, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_lo_u32 v0, s19, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_13
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %bb14
+; GFX11-TRUE16-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s21, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s24, -1, 0
+; GFX11-TRUE16-NEXT:    s_bitcmp0_b32 s21, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB2_3
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %bb15
+; GFX11-TRUE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-TRUE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s14
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s21
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-TRUE16-NEXT:    s_branch .LBB2_12
+; GFX11-TRUE16-NEXT:  .LBB2_3:
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:  .LBB2_4: ; %bb16
+; GFX11-TRUE16-NEXT:    s_load_b32 s1, s[16:17], 0x54
+; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s23, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, -1
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT:    s_and_b32 s9, s23, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s1, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s9, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB2_8
+; GFX11-TRUE16-NEXT:  ; %bb.5: ; %bb18.preheader
+; GFX11-TRUE16-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mul_hi_u32 s8, s29, s28
+; GFX11-TRUE16-NEXT:    s_mul_i32 s9, s29, s28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s8, s9, 1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, 0
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s8, s30
+; GFX11-TRUE16-NEXT:    s_mul_i32 s8, s8, s22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_mul_i32 s8, s8, s20
+; GFX11-TRUE16-NEXT:    s_or_b32 s8, s19, s8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[20:21], s[8:9], 1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s9
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v0, s[20:21]
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX11-TRUE16-NEXT:    .p2align 6
+; GFX11-TRUE16-NEXT:  .LBB2_6: ; %bb18
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, 0xffff, s8
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, s1, s8
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, s19, s13
+; GFX11-TRUE16-NEXT:    s_and_b32 s13, 0xffff, s9
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, s8, 1
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX11-TRUE16-NEXT:    s_and_b32 s20, s2, exec_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, s19, s13
+; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s13, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, 0x100, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_or_b32 s9, s13, s9
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_6
+; GFX11-TRUE16-NEXT:  ; %bb.7: ; %Flow
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT:  .LBB2_8: ; %Flow12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_12
+; GFX11-TRUE16-NEXT:  ; %bb.9:
+; GFX11-TRUE16-NEXT:    s_xor_b32 s1, s1, -1
+; GFX11-TRUE16-NEXT:  .LBB2_10: ; %bb17
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_10
+; GFX11-TRUE16-NEXT:  ; %bb.11: ; %Flow6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT:  .LBB2_12: ; %Flow11
+; GFX11-TRUE16-NEXT:    s_and_b32 s20, s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_or_not1_b32 s0, s18, exec_lo
+; GFX11-TRUE16-NEXT:  .LBB2_13: ; %Flow9
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s3, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_15
+; GFX11-TRUE16-NEXT:  ; %bb.14: ; %bb43
+; GFX11-TRUE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-TRUE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_or_b32 s20, s20, exec_lo
+; GFX11-TRUE16-NEXT:  .LBB2_15: ; %Flow14
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s20
+; GFX11-TRUE16-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
+; GFX11-TRUE16-NEXT:    ; divergent unreachable
+; GFX11-TRUE16-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: f2:
+; GFX11-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[16:17], s[4:5]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-FAKE16-NEXT:    s_load_b32 s19, s[16:17], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s13
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s20, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_lo_u32 v0, s19, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_13
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %bb14
+; GFX11-FAKE16-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s21, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s24, -1, 0
+; GFX11-FAKE16-NEXT:    s_bitcmp0_b32 s21, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB2_3
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %bb15
+; GFX11-FAKE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-FAKE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s21, s14
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s21
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-FAKE16-NEXT:    s_branch .LBB2_12
+; GFX11-FAKE16-NEXT:  .LBB2_3:
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB2_12
+; GFX11-FAKE16-NEXT:  .LBB2_4: ; %bb16
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[16:17], 0x54
+; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s23, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s23, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB2_8
+; GFX11-FAKE16-NEXT:  ; %bb.5: ; %bb18.preheader
+; GFX11-FAKE16-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mul_hi_u32 s0, s29, s28
+; GFX11-FAKE16-NEXT:    s_mul_i32 s1, s29, s28
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s1, 1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s0, s0, s30
+; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s0, s22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s0, s20
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s19, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s1
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[20:21]
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX11-FAKE16-NEXT:    .p2align 6
+; GFX11-FAKE16-NEXT:  .LBB2_6: ; %bb18
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s8, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, s19, s13
+; GFX11-FAKE16-NEXT:    s_and_b32 s13, 0xffff, s0
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX11-FAKE16-NEXT:    s_and_b32 s20, s9, exec_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s19, s13
+; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s13, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, 0x100, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s13, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_6
+; GFX11-FAKE16-NEXT:  ; %bb.7: ; %Flow
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:  .LBB2_8: ; %Flow12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_12
+; GFX11-FAKE16-NEXT:  ; %bb.9:
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, s8, -1
+; GFX11-FAKE16-NEXT:  .LBB2_10: ; %bb17
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_10
+; GFX11-FAKE16-NEXT:  ; %bb.11: ; %Flow6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT:  .LBB2_12: ; %Flow11
+; GFX11-FAKE16-NEXT:    s_and_b32 s20, s2, exec_lo
+; GFX11-FAKE16-NEXT:    s_or_not1_b32 s0, s18, exec_lo
+; GFX11-FAKE16-NEXT:  .LBB2_13: ; %Flow9
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s3, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_15
+; GFX11-FAKE16-NEXT:  ; %bb.14: ; %bb43
+; GFX11-FAKE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-FAKE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_or_b32 s20, s20, exec_lo
+; GFX11-FAKE16-NEXT:  .LBB2_15: ; %Flow14
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s20
+; GFX11-FAKE16-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
+; GFX11-FAKE16-NEXT:    ; divergent unreachable
+; GFX11-FAKE16-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
+; GFX11-FAKE16-NEXT:    s_endpgm
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i12 = mul i32 %arg, %i

diff  --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index ef91f36d60373..0d3340006f17e 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -1,10 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF64,DAGISEL-GFX11-WF64-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF64,DAGISEL-GFX11-WF64-FAKE16 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF64 %s
 
@@ -794,18 +798,32 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_float(float in
 }
 
 define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inreg %a, half %b) {
-  ; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_half
-  ; GISEL-GFX11: bb.1 (%ir-block.0):
-  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
-  ; GISEL-GFX11-NEXT: {{  $}}
-  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; GISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GISEL-GFX11-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; GISEL-GFX11-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; GISEL-GFX11-NEXT:   S_ENDPGM 0
+  ; GISEL-GFX11-TRUE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; GISEL-GFX11-TRUE16: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-TRUE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; GISEL-GFX11-TRUE16-NEXT: {{  $}}
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+  ; GISEL-GFX11-TRUE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+  ; GISEL-GFX11-TRUE16-NEXT:   [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, [[COPY3]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; GISEL-GFX11-TRUE16-NEXT:   FLAT_STORE_SHORT_t16 [[COPY4]], [[V_ADD_F16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; GISEL-GFX11-TRUE16-NEXT:   S_ENDPGM 0
+  ;
+  ; GISEL-GFX11-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; GISEL-GFX11-FAKE16: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-FAKE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; GISEL-GFX11-FAKE16-NEXT: {{  $}}
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-FAKE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GISEL-GFX11-FAKE16-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; GISEL-GFX11-FAKE16-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; GISEL-GFX11-FAKE16-NEXT:   S_ENDPGM 0
   ;
   ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_half
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -820,29 +838,55 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre
   ; GISEL-GFX10-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half
-  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
-  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-TRUE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX11-WF32-TRUE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_F16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_half
-  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
-  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX11-WF32-FAKE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-TRUE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX11-WF64-TRUE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_F16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_half
+  ; DAGISEL-GFX11-WF64-FAKE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   S_ENDPGM 0
   ;
   ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_half
   ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):
@@ -962,18 +1006,32 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
 }
 
 define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg %a, i16 %b) {
-  ; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
-  ; GISEL-GFX11: bb.1 (%ir-block.0):
-  ; GISEL-GFX11-NEXT:   liveins: $sgpr0, $vgpr8
-  ; GISEL-GFX11-NEXT: {{  $}}
-  ; GISEL-GFX11-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-  ; GISEL-GFX11-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; GISEL-GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; GISEL-GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GISEL-GFX11-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
-  ; GISEL-GFX11-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; GISEL-GFX11-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; GISEL-GFX11-NEXT:   S_ENDPGM 0
+  ; GISEL-GFX11-TRUE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; GISEL-GFX11-TRUE16: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-TRUE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; GISEL-GFX11-TRUE16-NEXT: {{  $}}
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY1]].lo16
+  ; GISEL-GFX11-TRUE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+  ; GISEL-GFX11-TRUE16-NEXT:   [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, [[COPY3]], 0, [[COPY2]], 0, 0, implicit $exec
+  ; GISEL-GFX11-TRUE16-NEXT:   [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; GISEL-GFX11-TRUE16-NEXT:   FLAT_STORE_SHORT_t16 [[COPY4]], [[V_ADD_NC_U16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; GISEL-GFX11-TRUE16-NEXT:   S_ENDPGM 0
+  ;
+  ; GISEL-GFX11-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; GISEL-GFX11-FAKE16: bb.1 (%ir-block.0):
+  ; GISEL-GFX11-FAKE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; GISEL-GFX11-FAKE16-NEXT: {{  $}}
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; GISEL-GFX11-FAKE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GISEL-GFX11-FAKE16-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
+  ; GISEL-GFX11-FAKE16-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; GISEL-GFX11-FAKE16-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; GISEL-GFX11-FAKE16-NEXT:   S_ENDPGM 0
   ;
   ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
   ; GISEL-GFX10: bb.1 (%ir-block.0):
@@ -988,29 +1046,55 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
   ; GISEL-GFX10-NEXT:   FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
   ; GISEL-GFX10-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
-  ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-WF32-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
-  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF32-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-  ; DAGISEL-GFX11-WF32-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-WF32-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-WF32-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX11-WF32-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-TRUE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX11-WF32-TRUE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_NC_U16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-TRUE16-NEXT:   S_ENDPGM 0
   ;
-  ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
-  ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0):
-  ; DAGISEL-GFX11-WF64-NEXT:   liveins: $sgpr0, $vgpr8
-  ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
-  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
-  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF64-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
-  ; DAGISEL-GFX11-WF64-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
-  ; DAGISEL-GFX11-WF64-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
-  ; DAGISEL-GFX11-WF64-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
-  ; DAGISEL-GFX11-WF64-NEXT:   S_ENDPGM 0
+  ; DAGISEL-GFX11-WF32-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX11-WF32-FAKE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF32-FAKE16-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-TRUE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX11-WF64-TRUE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[V_ADD_NC_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_ADD_NC_U16_t16_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   FLAT_STORE_SHORT_t16 killed [[COPY3]], killed [[V_ADD_NC_U16_t16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-TRUE16-NEXT:   S_ENDPGM 0
+  ;
+  ; DAGISEL-GFX11-WF64-FAKE16-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
+  ; DAGISEL-GFX11-WF64-FAKE16: bb.0 (%ir-block.0):
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   liveins: $sgpr0, $vgpr8
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT: {{  $}}
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
+  ; DAGISEL-GFX11-WF64-FAKE16-NEXT:   S_ENDPGM 0
   ;
   ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16
   ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0):

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
index fe69dc4906243..91aba09e942f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
-; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
+; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL,DAGISEL-TRUE16
+; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL,DAGISEL-FAKE16
 ; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
-; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL,GISEL-TRUE16
+; RUN: llc -mtriple=amdgcn -global-isel -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL,GISEL-FAKE16
 
 declare i64 @llvm.amdgcn.ballot.i64(i1)
 declare i64 @llvm.ctpop.i64(i64)
@@ -47,13 +49,21 @@ define amdgpu_cs i64 @non_compare(i32 %x) {
 ; DAGISEL-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
 ; DAGISEL-NEXT:    ; return to shader part epilog
 ;
-; GISEL-LABEL: non_compare:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
-; GISEL-NEXT:    s_mov_b32 s1, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GISEL-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
-; GISEL-NEXT:    ; return to shader part epilog
+; GISEL-TRUE16-LABEL: non_compare:
+; GISEL-TRUE16:       ; %bb.0:
+; GISEL-TRUE16-NEXT:    v_and_b16 v0.l, 1, v0.l
+; GISEL-TRUE16-NEXT:    s_mov_b32 s1, 0
+; GISEL-TRUE16-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v0.l
+; GISEL-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
+; GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GISEL-FAKE16-LABEL: non_compare:
+; GISEL-FAKE16:       ; %bb.0:
+; GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GISEL-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GISEL-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GISEL-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, exec_lo
+; GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %trunc = trunc i32 %x to i1
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
   ret i64 %ballot
@@ -182,3 +192,6 @@ true:
 false:
   ret i32 33
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; DAGISEL-FAKE16: {{.*}}
+; DAGISEL-TRUE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
index a009854542f21..a6352134bad25 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-DAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=ASM-DAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=ASM-DAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=ASM-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=ASM-GISEL-FAKE16 %s
 
 ; Test that we can use v0 for temporaries in the if.then block.
 define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) #0 {
@@ -27,28 +29,51 @@ define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr
 ; ASM-DAG-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; ASM-DAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; ASM-GISEL-LABEL: dead:
-; ASM-GISEL:       ; %bb.0: ; %entry
-; ASM-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; ASM-GISEL-NEXT:    s_wait_expcnt 0x0
-; ASM-GISEL-NEXT:    s_wait_samplecnt 0x0
-; ASM-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; ASM-GISEL-NEXT:    s_wait_kmcnt 0x0
-; ASM-GISEL-NEXT:    v_mov_b32_e32 v4, v0
-; ASM-GISEL-NEXT:    v_mov_b32_e32 v0, v1
-; ASM-GISEL-NEXT:    s_mov_b32 s0, exec_lo
-; ASM-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; ASM-GISEL-NEXT:    v_and_b32_e32 v1, 1, v4
-; ASM-GISEL-NEXT:    v_cmpx_ne_u32_e32 0, v1
-; ASM-GISEL-NEXT:    s_cbranch_execz .LBB0_2
-; ASM-GISEL-NEXT:  ; %bb.1: ; %if.then
-; ASM-GISEL-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; ASM-GISEL-NEXT:    global_store_b32 v[2:3], v0, off
-; ASM-GISEL-NEXT:    ; implicit-def: $vgpr0
-; ASM-GISEL-NEXT:  .LBB0_2: ; %if.end
-; ASM-GISEL-NEXT:    s_wait_alu 0xfffe
-; ASM-GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
-; ASM-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; ASM-GISEL-TRUE16-LABEL: dead:
+; ASM-GISEL-TRUE16:       ; %bb.0: ; %entry
+; ASM-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; ASM-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; ASM-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; ASM-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; ASM-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; ASM-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; ASM-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, v1
+; ASM-GISEL-TRUE16-NEXT:    s_mov_b32 s0, exec_lo
+; ASM-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-GISEL-TRUE16-NEXT:    v_and_b16 v1.l, 1, v4.l
+; ASM-GISEL-TRUE16-NEXT:    v_cmpx_ne_u16_e32 0, v1.l
+; ASM-GISEL-TRUE16-NEXT:    s_cbranch_execz .LBB0_2
+; ASM-GISEL-TRUE16-NEXT:  ; %bb.1: ; %if.then
+; ASM-GISEL-TRUE16-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; ASM-GISEL-TRUE16-NEXT:    global_store_b32 v[2:3], v0, off
+; ASM-GISEL-TRUE16-NEXT:    ; implicit-def: $vgpr0
+; ASM-GISEL-TRUE16-NEXT:  .LBB0_2: ; %if.end
+; ASM-GISEL-TRUE16-NEXT:    s_wait_alu 0xfffe
+; ASM-GISEL-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; ASM-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; ASM-GISEL-FAKE16-LABEL: dead:
+; ASM-GISEL-FAKE16:       ; %bb.0: ; %entry
+; ASM-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; ASM-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; ASM-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; ASM-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; ASM-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; ASM-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; ASM-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
+; ASM-GISEL-FAKE16-NEXT:    s_mov_b32 s0, exec_lo
+; ASM-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-GISEL-FAKE16-NEXT:    v_and_b32_e32 v1, 1, v4
+; ASM-GISEL-FAKE16-NEXT:    v_cmpx_ne_u32_e32 0, v1
+; ASM-GISEL-FAKE16-NEXT:    s_cbranch_execz .LBB0_2
+; ASM-GISEL-FAKE16-NEXT:  ; %bb.1: ; %if.then
+; ASM-GISEL-FAKE16-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; ASM-GISEL-FAKE16-NEXT:    global_store_b32 v[2:3], v0, off
+; ASM-GISEL-FAKE16-NEXT:    ; implicit-def: $vgpr0
+; ASM-GISEL-FAKE16-NEXT:  .LBB0_2: ; %if.end
+; ASM-GISEL-FAKE16-NEXT:    s_wait_alu 0xfffe
+; ASM-GISEL-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; ASM-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %dead = call i32 @llvm.amdgcn.dead.i32()
   br i1 %cond, label %if.then, label %if.end

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index dea0cc5fd07b6..addb395eccf11 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -1,23 +1,38 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
 ; FIXME: GlobalIsel doesn't support BF16 for now.
-; xUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11
+; xUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
+; xUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
 
 declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bfloat %c)
 
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dot2_bf16_bf16 v1, s2, s3, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
+; SDAG-GFX11-TRUE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    v_dot2_bf16_bf16 v0.l, s2, s3, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; SDAG-GFX11-TRUE16-NEXT:    s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
+; SDAG-GFX11-FAKE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; SDAG-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[6:7]
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    v_dot2_bf16_bf16 v1, s2, s3, v1
+; SDAG-GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; SDAG-GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b,
@@ -32,17 +47,32 @@ entry:
 }
 
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v0, off, s2
-; GFX11-NEXT:    scratch_load_u16 v1, off, s3
-; GFX11-NEXT:    scratch_load_b32 v2, off, s1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11-NEXT:    scratch_store_b16 off, v0, s0
-; GFX11-NEXT:    s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
+; SDAG-GFX11-TRUE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s1
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s2
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s3
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT:    v_dot2_bf16_bf16 v0.l, v1, v2, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s0
+; SDAG-GFX11-TRUE16-NEXT:    s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
+; SDAG-GFX11-FAKE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s2
+; SDAG-GFX11-FAKE16-NEXT:    scratch_load_u16 v1, off, s3
+; SDAG-GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s1
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s0
+; SDAG-GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(5) %r,
     ptr addrspace(5) %a,
     ptr addrspace(5) %b,
@@ -62,13 +92,21 @@ entry:
 ; Make sure we do not violate constant bus restriction with 3 scalar inputs and simingly inlinable literal.
 
 define amdgpu_ps void @test_llvm_amdgcn_fdot2_bf16_bf16_sis(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    v_mov_b32_e32 v2, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2
-; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; SDAG-GFX11-TRUE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s1
+; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT:    v_dot2_bf16_bf16 v2.l, s0, 0x3f803f80, v2.l
+; SDAG-GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-TRUE16-NEXT:    s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_sis:
+; SDAG-GFX11-FAKE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, s1
+; SDAG-GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT:    v_dot2_bf16_bf16 v2, s0, 0x3f803f80, v2
+; SDAG-GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; SDAG-GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     <2 x bfloat> inreg %a,
     bfloat inreg %c) {
@@ -80,4 +118,5 @@ entry:
 
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
 ; SDAG-GFX11: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 066edea969883..19e03486d122d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -1,22 +1,63 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16
 
 declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
 
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT:    s_load_b32 s3, s[4:5], 0x0
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dot2_f16_f16 v1, s2, s3, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
+; SDAG-GFX11-TRUE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[6:7]
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, s2, s3, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; SDAG-GFX11-TRUE16-NEXT:    s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
+; SDAG-GFX11-FAKE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; SDAG-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[6:7]
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    v_dot2_f16_f16 v1, s2, s3, v1
+; SDAG-GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; SDAG-GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GISEL-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
+; GISEL-GFX11-TRUE16:       ; %bb.0: ; %entry
+; GISEL-GFX11-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GISEL-GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GISEL-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[6:7]
+; GISEL-GFX11-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GISEL-GFX11-TRUE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, s2, s3, v0.l
+; GISEL-GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GISEL-GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GISEL-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
+; GISEL-GFX11-FAKE16:       ; %bb.0: ; %entry
+; GISEL-GFX11-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GISEL-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GISEL-GFX11-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GISEL-GFX11-FAKE16-NEXT:    s_load_b32 s3, s[4:5], 0x0
+; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT:    v_dot2_f16_f16 v1, s2, s3, v1
+; GISEL-GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GISEL-GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b,
@@ -31,29 +72,59 @@ entry:
 }
 
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
-; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
-; SDAG-GFX11:       ; %bb.0: ; %entry
-; SDAG-GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; SDAG-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT:    scratch_load_b32 v0, off, s2
-; SDAG-GFX11-NEXT:    scratch_load_u16 v1, off, s3
-; SDAG-GFX11-NEXT:    scratch_load_b32 v2, off, s1
-; SDAG-GFX11-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-GFX11-NEXT:    v_dot2_f16_f16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; SDAG-GFX11-NEXT:    scratch_store_b16 off, v0, s0
-; SDAG-GFX11-NEXT:    s_endpgm
+; SDAG-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
+; SDAG-GFX11-TRUE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s1
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s2
+; SDAG-GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s3
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, v1, v2, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s0
+; SDAG-GFX11-TRUE16-NEXT:    s_endpgm
+;
+; SDAG-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
+; SDAG-GFX11-FAKE16:       ; %bb.0: ; %entry
+; SDAG-GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s2
+; SDAG-GFX11-FAKE16-NEXT:    scratch_load_u16 v1, off, s3
+; SDAG-GFX11-FAKE16-NEXT:    scratch_load_b32 v2, off, s1
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    v_dot2_f16_f16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; SDAG-GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s0
+; SDAG-GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GISEL-GFX11-TRUE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
+; GISEL-GFX11-TRUE16:       ; %bb.0: ; %entry
+; GISEL-GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GISEL-GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s1
+; GISEL-GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s2
+; GISEL-GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v0, off, s3
+; GISEL-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GISEL-GFX11-TRUE16-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GISEL-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-TRUE16-NEXT:    v_dot2_f16_f16 v0.l, v1, v2, v0.l
+; GISEL-GFX11-TRUE16-NEXT:    scratch_store_b16 off, v0, s0
+; GISEL-GFX11-TRUE16-NEXT:    s_endpgm
 ;
-; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
-; GISEL-GFX11:       ; %bb.0: ; %entry
-; GISEL-GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GISEL-GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT:    scratch_load_b32 v0, off, s1
-; GISEL-GFX11-NEXT:    scratch_load_b32 v1, off, s2
-; GISEL-GFX11-NEXT:    scratch_load_u16 v2, off, s3
-; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-GFX11-NEXT:    v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GISEL-GFX11-NEXT:    scratch_store_b16 off, v0, s0
-; GISEL-GFX11-NEXT:    s_endpgm
+; GISEL-GFX11-FAKE16-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
+; GISEL-GFX11-FAKE16:       ; %bb.0: ; %entry
+; GISEL-GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT:    scratch_load_b32 v0, off, s1
+; GISEL-GFX11-FAKE16-NEXT:    scratch_load_b32 v1, off, s2
+; GISEL-GFX11-FAKE16-NEXT:    scratch_load_u16 v2, off, s3
+; GISEL-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-GFX11-FAKE16-NEXT:    v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GISEL-GFX11-FAKE16-NEXT:    scratch_store_b16 off, v0, s0
+; GISEL-GFX11-FAKE16-NEXT:    s_endpgm
     ptr addrspace(5) %r,
     ptr addrspace(5) %a,
     ptr addrspace(5) %b,
@@ -71,3 +142,7 @@ entry:
 }
 
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
+; GISEL-GFX11: {{.*}}
+; SDAG-GFX11: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
index 40b4b33e74a6f..93f0080deabe7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
 ; GFX9-LABEL: gather4_2d:
@@ -16,25 +18,45 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -52,25 +74,47 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_cube:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_cube:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_cube:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4 v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_cube:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_cube:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_cube:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -88,25 +132,47 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_2darray:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_2darray:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_2darray:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4 v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_2darray:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_2darray:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_2darray:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -124,25 +190,45 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_c_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_c_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_c_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_c_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_c_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_c_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -160,25 +246,47 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_cl_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4_cl v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -198,25 +306,45 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_c_cl_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_c_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_c_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_c_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_c_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_c_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -234,25 +362,45 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_b_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_b_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_b_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_b_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_b_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_b_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -270,25 +418,45 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_c_b_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_c_b_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_c_b_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_c_b_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_c_b_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_c_b_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -308,25 +476,48 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_b_cl_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_b_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_b_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4_b_cl v[0:3], v[2:4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_b_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_b_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_b_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -347,25 +538,45 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_c_b_cl_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    s_mov_b32 s12, exec_lo
-; GFX10-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX10-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_c_b_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_c_b_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_c_b_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_c_b_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_c_b_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f16.f16(i32 1, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -380,19 +591,35 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_l_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_l_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_l_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    image_gather4_l v[0:3], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_l_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_l_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_l_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -409,19 +636,33 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_c_l_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX10-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_c_l_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_c_l_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_c_l_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_c_l_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_c_l_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -436,19 +677,33 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_lz_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_lz_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_lz_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_lz_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_lz_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_lz_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -463,19 +718,33 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: gather4_c_lz_2d:
-; GFX10:       ; %bb.0: ; %main_body
-; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX10-NEXT:    image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: gather4_c_lz_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    image_gather4_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: gather4_c_lz_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: gather4_c_lz_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: gather4_c_lz_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    image_gather4_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: gather4_c_lz_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    image_gather4_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -503,3 +772,6 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32, float, hal
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
 attributes #2 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10: {{.*}}
+; GFX12: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
index e789b964d3cf1..f188d37c904ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
 
 define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
 ; GFX11-LABEL: load_2dmsaa:
@@ -281,40 +283,74 @@ main_body:
 }
 
 define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %fragid) {
-; GFX11-LABEL: load_2dmsaa_a16:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x99,0x01,0x61,0xf0,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: load_2dmsaa_a16:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l ; encoding: [0x02,0x39,0x06,0x7e]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l ; encoding: [0x00,0x39,0x04,0x7e]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l ; encoding: [0x01,0x39,0x04,0x7f]
+; GFX11-TRUE16-NEXT:    image_msaa_load v[0:3], v[2:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x01,0x61,0xf0,0x02,0x00,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: load_2dmsaa_a16:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: load_2dmsaa_a16:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x99,0x01,0x61,0xf0,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: load_2dmsaa_a16:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX12-TRUE16-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: load_2dmsaa_a16:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32 1, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v
 }
 
 define amdgpu_ps <4 x float> @load_2darraymsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) {
-; GFX11-LABEL: load_2darraymsaa_a16:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x04,0x61,0xf0,0x01,0x00,0x00,0x00]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: load_2darraymsaa_a16:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l ; encoding: [0x02,0x39,0x08,0x7e]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.l ; encoding: [0x03,0x39,0x08,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l ; encoding: [0x00,0x39,0x06,0x7e]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v1.l ; encoding: [0x01,0x39,0x06,0x7f]
+; GFX11-TRUE16-NEXT:    image_msaa_load v[0:3], v[3:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x04,0x61,0xf0,0x03,0x00,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: load_2darraymsaa_a16:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x47,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: load_2darraymsaa_a16:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v0, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_msaa_load v[0:3], v[1:2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x04,0x61,0xf0,0x01,0x00,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: load_2darraymsaa_a16:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX12-TRUE16-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x47,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: load_2darraymsaa_a16:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x47,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32 4, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
   ret <4 x float> %v

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
index 10a85aa7c02c7..4a58091a14617 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) {
 ; GFX9-LABEL: sample_1d:
@@ -67,25 +69,45 @@ define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -113,25 +135,47 @@ define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_3d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_3d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_3d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_3d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_3d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_3d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -159,25 +203,47 @@ define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_cube:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_cube:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_cube:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_cube:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_cube:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_cube:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -205,25 +271,45 @@ define amdgpu_ps <4 x float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_1darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_1darray:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_1darray:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_1darray:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_1darray:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_1darray:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -251,25 +337,47 @@ define amdgpu_ps <4 x float> @sample_2darray(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_2darray:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_2darray:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_2darray:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_2darray:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_2darray:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_2darray:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -338,25 +446,45 @@ define amdgpu_ps <4 x float> @sample_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_c v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -384,25 +512,45 @@ define amdgpu_ps <4 x float> @sample_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_cl_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_cl_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_cl_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_cl_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_cl_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_cl_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -430,25 +578,47 @@ define amdgpu_ps <4 x float> @sample_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_cl v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -476,25 +646,45 @@ define amdgpu_ps <4 x float> @sample_c_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_cl_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_cl_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_c_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_cl_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_cl_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_cl_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_c_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_cl_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_c_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -524,25 +714,45 @@ define amdgpu_ps <4 x float> @sample_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -611,25 +821,45 @@ define amdgpu_ps <4 x float> @sample_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_b_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_b_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_b_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_b_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_b_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_b_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_b v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -698,25 +928,45 @@ define amdgpu_ps <4 x float> @sample_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_b_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_b_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_b_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_b_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_b_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_b_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_c_b v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -744,25 +994,45 @@ define amdgpu_ps <4 x float> @sample_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_b_cl_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_b_cl_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_b_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_b_cl_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_b_cl_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_b_cl_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_b_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_b_cl_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_b_cl v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -792,25 +1062,48 @@ define amdgpu_ps <4 x float> @sample_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_b_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_b_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_b_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_b_cl v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_b_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_b_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_b_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -838,25 +1131,45 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_b_cl_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_b_cl_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_b_cl_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_b_cl_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_b_cl_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_b_cl_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -887,25 +1200,45 @@ define amdgpu_ps <4 x float> @sample_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_b_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s12, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX11-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_b_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s12, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s12
-; GFX12-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_b_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-TRUE16-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_b_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX11-FAKE16-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_b_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-TRUE16-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_b_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s12, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX12-FAKE16-NEXT:    image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f16.f16(i32 15, half %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -960,23 +1293,44 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_d_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v1, v0, 0x5040100
-; GFX11-NEXT:    image_sample_d_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_d_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_d_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v1.l
+; GFX11-TRUE16-NEXT:    image_sample_d_g16 v[0:3], v[4:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_d_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_d_g16 v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_d_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_d_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1008,25 +1362,48 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_d_3d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
-; GFX11-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[7:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_d_3d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_mov_b32_e32 v10, v8
-; GFX12-NEXT:    v_mov_b32_e32 v8, v5
-; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v9, v7, v6, 0x5040100
-; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[8:10]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_d_3d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[8:9]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_d_3d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[7:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_d_3d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[7:9]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_d_3d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v10, v8
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v8, v5
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v9, v7, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[8:10]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1083,23 +1460,41 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_d_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_d_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1121,19 +1516,33 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_d_cl_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_d_cl_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_d_cl_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_d_cl_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_d_cl_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_d_cl_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_d_cl_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v1, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1159,23 +1568,45 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_d_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_d_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_d_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v1.l
+; GFX11-TRUE16-NEXT:    image_sample_d_cl_g16 v[0:3], v[4:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_d_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_d_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_d_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1197,19 +1628,33 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_cl_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_d_cl_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_cl_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX11-TRUE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_d_cl_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_cl_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX12-TRUE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_cl_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1237,23 +1682,43 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_d_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v6, v6, v5, 0x5040100
-; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_d_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[7:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v6, v5, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1275,19 +1740,33 @@ define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_l_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_l_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_l_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_l_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_l_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_l_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1309,19 +1788,35 @@ define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_l_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_l_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_l_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT:    image_sample_l v[0:3], v[2:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_l_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_l_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_l_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1343,19 +1838,33 @@ define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_l_1d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_l_1d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    image_sample_c_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_l_1d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_l_1d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_l_1d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    image_sample_c_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_l_1d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_l v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1379,19 +1888,33 @@ define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_l_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_l_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_l_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_l_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_l_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_l_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1442,19 +1965,33 @@ define amdgpu_ps <4 x float> @sample_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_lz_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_lz_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX12-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_lz_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_lz_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_lz_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_lz_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1505,19 +2042,33 @@ define amdgpu_ps <4 x float> @sample_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_lz_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX11-NEXT:    image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_lz_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
-; GFX12-NEXT:    image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_lz_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_lz_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_lz_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-TRUE16-NEXT:    image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_lz_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_lz v[0:3], [v0, v1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -1549,23 +2100,46 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_o_2darray_V1:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
-; GFX11-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[7:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_d_o_2darray_V1:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v6, v5, v4, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[8:9]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[7:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[7:9]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v5, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret float %v
@@ -1597,23 +2171,46 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_o_2darray_V2:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
-; GFX11-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[7:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: sample_c_d_o_2darray_V2:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v6, v5, v4, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
-; GFX12-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[8:9]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX11-FAKE16-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
+; GFX11-FAKE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[7:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v8.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v6.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v7.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v5.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX12-TRUE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[7:9]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v7, v7, v6, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v6, v5, v4, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100
+; GFX12-FAKE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <2 x float> %v

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
index 8861ff4c78137..323d0fbe741a4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll
@@ -3,8 +3,10 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; TONGA-LABEL: image_sample_2d_f16:
@@ -121,33 +123,63 @@ define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    global_store_dword v4, v1, s[12:13]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: image_sample_2d_f16_tfe:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    s_mov_b32 s14, exec_lo
-; GFX11-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT:    v_mov_b32_e32 v2, v0
-; GFX11-NEXT:    v_mov_b32_e32 v5, v4
-; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX11-NEXT:    image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v4, v1, s[12:13]
-; GFX11-NEXT:    ; return to shader part epilog
-;
-; GFX12-LABEL: image_sample_2d_f16_tfe:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s14, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v0
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT:    s_and_b32 exec_lo, exec_lo, s14
-; GFX12-NEXT:    image_sample v[0:1], [v3, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    global_store_b32 v4, v1, s[12:13]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: image_sample_2d_f16_tfe:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, exec_lo
+; GFX11-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s14
+; GFX11-TRUE16-NEXT:    image_sample v[3:4], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT:    global_store_b32 v2, v4, s[12:13]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: image_sample_2d_f16_tfe:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, exec_lo
+; GFX11-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s14
+; GFX11-FAKE16-NEXT:    image_sample v[0:1], v[2:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_store_b32 v4, v1, s[12:13]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: image_sample_2d_f16_tfe:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s14, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 exec_lo, exec_lo, s14
+; GFX12-TRUE16-NEXT:    image_sample v[3:4], [v0, v1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
+; GFX12-TRUE16-NEXT:    global_store_b32 v2, v4, s[12:13]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: image_sample_2d_f16_tfe:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s14, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v3, v0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v4
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-FAKE16-NEXT:    s_and_b32 exec_lo, exec_lo, s14
+; GFX12-FAKE16-NEXT:    image_sample v[0:1], [v3, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b32 v4, v1, s[12:13]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
   %tex.vec = extractvalue {half, i32} %tex, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
index aa24ac394730e..e7b048dda1c1f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
 ; GFX10-LABEL: sample_d_1d:
@@ -35,21 +37,37 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_d_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_d_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX11-TRUE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: sample_d_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_d_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x00]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_d_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX12-TRUE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_d_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -66,21 +84,37 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_d_3d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x03,0x05,0x06]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_d_3d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX11-TRUE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x03,0x05,0x06]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: sample_d_3d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x03,0x05]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_d_3d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v[6:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x09,0x0f,0xe4,0xf0,0x00,0x00,0x00,0x08,0x02,0x03,0x05,0x06]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_d_3d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX12-TRUE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x03,0x05]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_d_3d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_sample_d_g16 v[0:3], [v0, v2, v3, v[5:8]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x40,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x03,0x05]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -118,21 +152,37 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l ; encoding: [0x02,0x39,0x02,0x7f]
+; GFX11-TRUE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: sample_c_d_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_c_d_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0xe8,0xf0,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l ; encoding: [0x02,0x39,0x02,0x7f]
+; GFX12-TRUE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_sample_c_d_g16 v[0:3], [v0, v1, v3, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x80,0xce,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -170,21 +220,37 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_d_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_d_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX11-TRUE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: sample_d_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_d_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x7c,0xf1,0x00,0x00,0x00,0x08,0x02,0x04,0x05,0x06]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_d_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l ; encoding: [0x01,0x39,0x00,0x7f]
+; GFX12-TRUE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_d_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v[5:6]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0xd7,0xe7,0x00,0x00,0x00,0x04,0x00,0x02,0x04,0x05]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -224,21 +290,37 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_cl_2d:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_cl_2d:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l ; encoding: [0x02,0x39,0x02,0x7f]
+; GFX11-TRUE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: sample_c_d_cl_2d:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_c_d_cl_2d:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v[6:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x05,0x0f,0x50,0xf1,0x00,0x00,0x00,0x08,0x01,0x03,0x05,0x06]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_cl_2d:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v4.l ; encoding: [0x04,0x39,0x06,0x7f]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l ; encoding: [0x02,0x39,0x02,0x7f]
+; GFX12-TRUE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_cl_2d:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100 ; encoding: [0x03,0x00,0x44,0xd6,0x04,0x07,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100 ; encoding: [0x01,0x00,0x44,0xd6,0x02,0x03,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v[5:7]], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x00,0xd5,0xe7,0x00,0x00,0x00,0x04,0x00,0x01,0x03,0x05]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <4 x float> %v
@@ -257,21 +339,40 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_o_2darray_V1:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x04,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v5.l ; encoding: [0x05,0x39,0x08,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX11-TRUE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x04,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: sample_c_d_o_2darray_V1:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x0f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x04,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, v5 ; encoding: [0x05,0x03,0x12,0x7e]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l ; encoding: [0x04,0x39,0x0a,0x7e]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v9.l ; encoding: [0x09,0x39,0x0a,0x7f]
+; GFX12-TRUE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x0f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_o_2darray_V1:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_sample_c_d_o_g16 v0, [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x0f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret float %v
@@ -290,21 +391,40 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: sample_c_d_o_2darray_V2:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX11-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x06,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v5.l ; encoding: [0x05,0x39,0x08,0x7f]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX11-TRUE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x06,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: sample_c_d_o_2darray_V2:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
-; GFX12-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x8f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
-; GFX12-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100 ; encoding: [0x04,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX11-FAKE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v[6:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x15,0x06,0xf0,0xf0,0x00,0x00,0x00,0x08,0x01,0x02,0x04,0x06]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-TRUE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, v5 ; encoding: [0x05,0x03,0x12,0x7e]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v4.l ; encoding: [0x04,0x39,0x0a,0x7e]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l ; encoding: [0x03,0x39,0x04,0x7f]
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) ; encoding: [0x03,0x00,0x87,0xbf]
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v9.l ; encoding: [0x09,0x39,0x0a,0x7f]
+; GFX12-TRUE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x8f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: sample_c_d_o_2darray_V2:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v5, v4, 0x5040100 ; encoding: [0x05,0x00,0x44,0xd6,0x05,0x09,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
+; GFX12-FAKE16-NEXT:    image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v[5:8]], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x00,0x8f,0xe5,0x00,0x00,0x00,0x04,0x00,0x01,0x02,0x05]
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
   ret <2 x float> %v

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
index 90dfab501d0a4..4873b42a235e3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.noret.ll
@@ -1,10 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define amdgpu_ps void @sample_1d_nortn(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
 ; GFX10PLUS-LABEL: sample_1d_nortn:
@@ -476,4 +481,10 @@ attributes #1 = { nounwind readonly }
 ; GFX10-SDAG: {{.*}}
 ; GFX11: {{.*}}
 ; GFX11-GISEL: {{.*}}
+; GFX11-GISEL-FAKE16: {{.*}}
 ; GFX11-SDAG: {{.*}}
+; GFX11-SDAG-FAKE16: {{.*}}
+; GFX11-SDAG-TRUE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}
+; GFX12-SDAG-FAKE16: {{.*}}
+; GFX12-SDAG-TRUE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index 19da3f4503aa5..4d937dade51f4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
 ; GFX11-LABEL: v_interp_f32:
@@ -227,23 +228,41 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v3, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: v_interp_f16:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s3, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    s_mov_b32 m0, s2
-; GFX12-NEXT:    ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
-; GFX12-NEXT:    s_mov_b32 exec_lo, s3
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GFX12-NEXT:    v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GFX12-NEXT:    v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GFX12-NEXT:    v_add_f16_e32 v0, v3, v0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-TRUE16-LABEL: v_interp_f16:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    s_mov_b32 m0, s2
+; GFX12-TRUE16-NEXT:    ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
+; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX12-TRUE16-NEXT:    v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX12-TRUE16-NEXT:    v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX12-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: v_interp_f16:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    s_mov_b32 m0, s2
+; GFX12-FAKE16-NEXT:    ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
+; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX12-FAKE16-NEXT:    v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX12-FAKE16-NEXT:    v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX12-FAKE16-NEXT:    v_add_f16_e32 v0, v3, v0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
   %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
@@ -291,23 +310,41 @@ define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inre
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v3, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: v_interp_rtz_f16:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_mov_b32 s3, exec_lo
-; GFX12-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX12-NEXT:    s_mov_b32 m0, s2
-; GFX12-NEXT:    ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
-; GFX12-NEXT:    s_mov_b32 exec_lo, s3
-; GFX12-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
-; GFX12-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
-; GFX12-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
-; GFX12-NEXT:    v_add_f16_e32 v0, v3, v0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-TRUE16-LABEL: v_interp_rtz_f16:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-TRUE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-TRUE16-NEXT:    s_mov_b32 m0, s2
+; GFX12-TRUE16-NEXT:    ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
+; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
+; GFX12-TRUE16-NEXT:    v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
+; GFX12-TRUE16-NEXT:    v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
+; GFX12-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: v_interp_rtz_f16:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-FAKE16-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX12-FAKE16-NEXT:    s_mov_b32 m0, s2
+; GFX12-FAKE16-NEXT:    ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
+; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s3
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GFX12-FAKE16-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GFX12-FAKE16-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GFX12-FAKE16-NEXT:    v_add_f16_e32 v0, v3, v0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
   %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
@@ -344,17 +381,30 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v1, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-LABEL: v_interp_f16_imm_params:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; GFX12-NEXT:    v_mov_b32_e32 v2, s1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
-; GFX12-NEXT:    v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX12-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-TRUE16-LABEL: v_interp_f16_imm_params:
+; GFX12-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
+; GFX12-TRUE16-NEXT:    v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX12-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-FAKE16-LABEL: v_interp_f16_imm_params:
+; GFX12-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
+; GFX12-FAKE16-NEXT:    v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX12-FAKE16-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX12-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
   %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 4509d954c5e8b..9606c68684957 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -3,9 +3,12 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1013 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX10,GFX1030 %s
 ; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=PRE-GFX12,GFX11,GFX11-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float3 ray_origin, float3 ray_dir, float3 ray_inv_dir, uint4 texture_descr)
 ; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float3 ray_origin, half3 ray_dir, half3 ray_inv_dir, uint4 texture_descr)
@@ -69,67 +72,138 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 inreg %node_ptr, f
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: image_bvh_intersect_ray_a16:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT:    s_lshr_b32 s2, s7, 16
-; GFX11-NEXT:    s_lshr_b32 s3, s5, 16
-; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
-; GFX11-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
-; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
-; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
-; GFX11-NEXT:    s_mov_b32 s15, s12
-; GFX11-NEXT:    s_mov_b32 s14, s11
-; GFX11-NEXT:    s_mov_b32 s13, s10
-; GFX11-NEXT:    s_mov_b32 s12, s9
-; GFX11-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: image_bvh_intersect_ray_a16:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s2, s7, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s5, 16
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s5
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s8
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s2
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v6, s0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, s1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s9
+; GFX11-TRUE16-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-SDAG-LABEL: image_bvh_intersect_ray_a16:
-; GFX12-SDAG:       ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-SDAG-NEXT:    s_lshr_b32 s2, s7, 16
-; GFX12-SDAG-NEXT:    s_lshr_b32 s3, s5, 16
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
-; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
-; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
-; GFX12-SDAG-NEXT:    s_mov_b32 s15, s12
-; GFX12-SDAG-NEXT:    s_mov_b32 s14, s11
-; GFX12-SDAG-NEXT:    s_mov_b32 s13, s10
-; GFX12-SDAG-NEXT:    s_mov_b32 s12, s9
-; GFX12-SDAG-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: image_bvh_intersect_ray_a16:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s2, s7, 16
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s5, 16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s10
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s9
+; GFX11-FAKE16-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: image_bvh_intersect_ray_a16:
-; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    s_mov_b32 s20, s2
-; GFX12-GISEL-NEXT:    s_mov_b32 s22, s4
-; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s4, s7, s5
-; GFX12-GISEL-NEXT:    s_mov_b32 s21, s3
-; GFX12-GISEL-NEXT:    s_pack_hh_b32_b16 s5, s7, s5
-; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s6, s8, s6
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
-; GFX12-GISEL-NEXT:    s_mov_b32 s16, s9
-; GFX12-GISEL-NEXT:    s_mov_b32 s17, s10
-; GFX12-GISEL-NEXT:    s_mov_b32 s18, s11
-; GFX12-GISEL-NEXT:    s_mov_b32 s19, s12
-; GFX12-GISEL-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[16:19] a16
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-TRUE16-LABEL: image_bvh_intersect_ray_a16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-TRUE16-NEXT:    s_lshr_b32 s2, s7, 16
+; GFX12-SDAG-TRUE16-NEXT:    s_lshr_b32 s3, s5, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s3, s6
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s4, s5
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s7
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s6, s8
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s4, s5
+; GFX12-SDAG-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s2
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v6, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v7, s1
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s15, s12
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s14, s11
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s13, s10
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s12, s9
+; GFX12-SDAG-TRUE16-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: image_bvh_intersect_ray_a16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-FAKE16-NEXT:    s_lshr_b32 s2, s7, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_lshr_b32 s3, s5, 16
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s3, s2
+; GFX12-SDAG-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s7
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3
+; GFX12-SDAG-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s6, s8
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s15, s12
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s14, s11
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s13, s10
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s12, s9
+; GFX12-SDAG-FAKE16-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[12:15] a16
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: image_bvh_intersect_ray_a16:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s20, s2
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s22, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s7, s5
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s21, s3
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s8, s6
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s5, s4
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s16, s9
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s17, s10
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s18, s11
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s19, s12
+; GFX12-GISEL-TRUE16-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[16:19] a16
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: image_bvh_intersect_ray_a16:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s20, s2
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s22, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s7, s5
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s21, s3
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_hh_b32_b16 s5, s7, s5
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s8, s6
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s4
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s16, s9
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s17, s10
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s18, s11
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s19, s12
+; GFX12-GISEL-FAKE16-NEXT:    image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[16:19] a16
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>
@@ -189,70 +263,142 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 inreg %node_ptr,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: image_bvh64_intersect_ray_a16:
-; GFX11:       ; %bb.0: ; %main_body
-; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
-; GFX11-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
-; GFX11-NEXT:    s_lshr_b32 s3, s6, 16
-; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
-; GFX11-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX11-NEXT:    v_mov_b32_e32 v8, s2
-; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
-; GFX11-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
-; GFX11-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
-; GFX11-NEXT:    s_mov_b32 s15, s13
-; GFX11-NEXT:    s_mov_b32 s14, s12
-; GFX11-NEXT:    s_mov_b32 s13, s11
-; GFX11-NEXT:    s_mov_b32 s12, s10
-; GFX11-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: image_bvh64_intersect_ray_a16:
+; GFX11-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, s0
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s6, 16
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s8
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s9
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s0
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v8, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s13
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s11
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s10
+; GFX11-TRUE16-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-SDAG-LABEL: image_bvh64_intersect_ray_a16:
-; GFX12-SDAG:       ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
-; GFX12-SDAG-NEXT:    s_lshr_b32 s3, s6, 16
-; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-SDAG-NEXT:    s_lshr_b32 s0, s8, 16
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v8, s2
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
-; GFX12-SDAG-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
-; GFX12-SDAG-NEXT:    s_mov_b32 s15, s13
-; GFX12-SDAG-NEXT:    s_mov_b32 s14, s12
-; GFX12-SDAG-NEXT:    s_mov_b32 s13, s11
-; GFX12-SDAG-NEXT:    s_mov_b32 s12, s10
-; GFX12-SDAG-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    ; return to shader part epilog
+; GFX11-FAKE16-LABEL: image_bvh64_intersect_ray_a16:
+; GFX11-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s6, 16
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, s2
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s15, s13
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s11
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s10
+; GFX11-FAKE16-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
 ;
-; GFX12-GISEL-LABEL: image_bvh64_intersect_ray_a16:
-; GFX12-GISEL:       ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT:    s_mov_b32 s20, s3
-; GFX12-GISEL-NEXT:    s_mov_b32 s21, s4
-; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s4, s8, s6
-; GFX12-GISEL-NEXT:    s_mov_b32 s22, s5
-; GFX12-GISEL-NEXT:    s_pack_hh_b32_b16 s5, s8, s6
-; GFX12-GISEL-NEXT:    s_pack_ll_b32_b16 s6, s9, s7
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX12-GISEL-NEXT:    s_mov_b32 s16, s10
-; GFX12-GISEL-NEXT:    s_mov_b32 s17, s11
-; GFX12-GISEL-NEXT:    s_mov_b32 s18, s12
-; GFX12-GISEL-NEXT:    s_mov_b32 s19, s13
-; GFX12-GISEL-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[16:19] a16
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    ; return to shader part epilog
+; GFX12-SDAG-TRUE16-LABEL: image_bvh64_intersect_ray_a16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v6, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX12-SDAG-TRUE16-NEXT:    s_lshr_b32 s3, s6, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s1, s7
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s3, s6
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s4, s8
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s9
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX12-SDAG-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v4, s0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v8, s2
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s15, s13
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s14, s12
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s13, s11
+; GFX12-SDAG-TRUE16-NEXT:    s_mov_b32 s12, s10
+; GFX12-SDAG-TRUE16-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-SDAG-FAKE16-LABEL: image_bvh64_intersect_ray_a16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1
+; GFX12-SDAG-FAKE16-NEXT:    s_lshr_b32 s3, s6, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s6, s8
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-SDAG-FAKE16-NEXT:    s_lshr_b32 s0, s8, 16
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v8, s2
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s3, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s7, s9
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s15, s13
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s14, s12
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s13, s11
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s12, s10
+; GFX12-SDAG-FAKE16-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[12:15] a16
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-TRUE16-LABEL: image_bvh64_intersect_ray_a16:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %main_body
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s21, s4
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s8, s6
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s20, s3
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s22, s5
+; GFX12-GISEL-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s9, s7
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s5, s4
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21
+; GFX12-GISEL-TRUE16-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v4, s5
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s16, s10
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s17, s11
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s18, s12
+; GFX12-GISEL-TRUE16-NEXT:    s_mov_b32 s19, s13
+; GFX12-GISEL-TRUE16-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[16:19] a16
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX12-GISEL-FAKE16-LABEL: image_bvh64_intersect_ray_a16:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %main_body
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s20, s3
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s21, s4
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s8, s6
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s22, s5
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_hh_b32_b16 s5, s8, s6
+; GFX12-GISEL-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s9, s7
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s4
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v1, s21
+; GFX12-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v5, s6
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v4, s5
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s16, s10
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s17, s11
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s18, s12
+; GFX12-GISEL-FAKE16-NEXT:    s_mov_b32 s19, s13
+; GFX12-GISEL-FAKE16-NEXT:    image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[16:19] a16
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr)
   %r = bitcast <4 x i32> %v to <4 x float>

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index 46829b07f265d..30b7b3b472f08 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
 
 define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
 ; CHECK-LABEL: raw_atomic_buffer_load_i32:
@@ -207,26 +209,66 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
-; CHECK-LABEL: raw_atomic_buffer_load_v4i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB7_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
+; CHECK-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; CHECK-SDAG-TRUE16:       ; %bb.0: ; %bb
+; CHECK-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-SDAG-TRUE16-NEXT:  .LBB7_1: ; %bb1
+; CHECK-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; CHECK-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
+; CHECK-FAKE16:       ; %bb.0: ; %bb
+; CHECK-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-FAKE16-NEXT:  .LBB7_1: ; %bb1
+; CHECK-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-FAKE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-FAKE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; CHECK-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-FAKE16-NEXT:    s_endpgm
+;
+; CHECK-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
+; CHECK-GISEL-TRUE16:       ; %bb.0: ; %bb
+; CHECK-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-GISEL-TRUE16-NEXT:  .LBB7_1: ; %bb1
+; CHECK-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; CHECK-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 4813a71f5c7b5..643805d6be93e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) {
 ; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
@@ -207,26 +209,66 @@ bb2:
 }
 
 define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) {
-; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:  .LBB7_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
+; CHECK-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; CHECK-SDAG-TRUE16:       ; %bb.0: ; %bb
+; CHECK-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-SDAG-TRUE16-NEXT:  .LBB7_1: ; %bb1
+; CHECK-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; CHECK-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; CHECK-FAKE16:       ; %bb.0: ; %bb
+; CHECK-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-FAKE16-NEXT:  .LBB7_1: ; %bb1
+; CHECK-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-FAKE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-FAKE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; CHECK-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-FAKE16-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-FAKE16-NEXT:    s_endpgm
+;
+; CHECK-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
+; CHECK-GISEL-TRUE16:       ; %bb.0: ; %bb
+; CHECK-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-GISEL-TRUE16-NEXT:  .LBB7_1: ; %bb1
+; CHECK-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; CHECK-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll
index cc75531e4953e..3fd100d4a7c89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll
@@ -3,7 +3,8 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PREGFX10-PACKED %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX10-PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED,GFX11-PACKED-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=GFX11-PACKED,GFX11-PACKED-FAKE16 %s
 
 define amdgpu_ps half @tbuffer_load_d16_x(ptr addrspace(8) inreg %rsrc) {
 ; PREGFX10-UNPACKED-LABEL: tbuffer_load_d16_x:
@@ -90,12 +91,19 @@ define amdgpu_ps half @tbuffer_load_d16_xyz(ptr addrspace(8) inreg %rsrc) {
 ; GFX10-PACKED-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-PACKED-NEXT:    ; return to shader part epilog
 ;
-; GFX11-PACKED-LABEL: tbuffer_load_d16_xyz:
-; GFX11-PACKED:       ; %bb.0: ; %main_body
-; GFX11-PACKED-NEXT:    tbuffer_load_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
-; GFX11-PACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PACKED-NEXT:    v_mov_b32_e32 v0, v1
-; GFX11-PACKED-NEXT:    ; return to shader part epilog
+; GFX11-PACKED-TRUE16-LABEL: tbuffer_load_d16_xyz:
+; GFX11-PACKED-TRUE16:       ; %bb.0: ; %main_body
+; GFX11-PACKED-TRUE16-NEXT:    tbuffer_load_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
+; GFX11-PACKED-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PACKED-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-PACKED-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-PACKED-FAKE16-LABEL: tbuffer_load_d16_xyz:
+; GFX11-PACKED-FAKE16:       ; %bb.0: ; %main_body
+; GFX11-PACKED-FAKE16-NEXT:    tbuffer_load_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
+; GFX11-PACKED-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PACKED-FAKE16-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-PACKED-FAKE16-NEXT:    ; return to shader part epilog
 main_body:
   %data = call <3 x half> @llvm.amdgcn.raw.ptr.tbuffer.load.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 22, i32 0)
   %elt = extractelement <3 x half> %data, i32 2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index eb2d95e4db2d5..2d8e9f2ab39e4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
 
 define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %index) {
 ; CHECK-LABEL: struct_atomic_buffer_load_i32:
@@ -257,29 +259,75 @@ bb2:
 }
 
 define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32 %index) {
-; CHECK-LABEL: struct_atomic_buffer_load_v4i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v1, s6
-; CHECK-NEXT:  .LBB8_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
+; CHECK-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; CHECK-SDAG-TRUE16:       ; %bb.0: ; %bb
+; CHECK-SDAG-TRUE16-NEXT:    s_clause 0x1
+; CHECK-SDAG-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; CHECK-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-SDAG-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; CHECK-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
+; CHECK-FAKE16:       ; %bb.0: ; %bb
+; CHECK-FAKE16-NEXT:    s_clause 0x1
+; CHECK-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; CHECK-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; CHECK-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-FAKE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-FAKE16-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-FAKE16-NEXT:    s_endpgm
+;
+; CHECK-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
+; CHECK-GISEL-TRUE16:       ; %bb.0: ; %bb
+; CHECK-GISEL-TRUE16-NEXT:    s_clause 0x1
+; CHECK-GISEL-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; CHECK-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-GISEL-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; CHECK-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; CHECK-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
index ffa598d634e43..701b80d59bcc6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mcpu=gfx1200 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 @esgs_ring = external addrspace(3) global [0 x i32], align 65536
 
@@ -88,58 +90,113 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_write2_b32 v2, v0, v1 offset0:7 offset1:8
 ;
-; GFX11-LABEL: main:
-; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX11-NEXT:  ; %bb.2:
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
-; GFX11-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
+; GFX11-TRUE16-LABEL: main:
+; GFX11-TRUE16:       ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-TRUE16-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX11-TRUE16-NEXT:  ; %bb.2:
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-TRUE16-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
 ;
-; GFX12-LABEL: main:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    s_mov_b32 s1, exec_lo
-; GFX12-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX12-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX12-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX12-NEXT:    s_wait_alu 0xf1ff
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
-; GFX12-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT:    ; implicit-def: $vgpr4
-; GFX12-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX12-NEXT:  ; %bb.2:
-; GFX12-NEXT:    s_mov_b32 exec_lo, s1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
-; GFX12-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
+; GFX11-FAKE16-LABEL: main:
+; GFX11-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-FAKE16-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX11-FAKE16-NEXT:  ; %bb.2:
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
+;
+; GFX12-TRUE16-LABEL: main:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-TRUE16-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
+; GFX12-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-TRUE16-NEXT:    ; implicit-def: $vgpr4
+; GFX12-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-TRUE16-NEXT:  ; %bb.2:
+; GFX12-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v5.h
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX12-TRUE16-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
+;
+; GFX12-FAKE16-LABEL: main:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-FAKE16-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
+; GFX12-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX12-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-FAKE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-FAKE16-NEXT:  ; %bb.2:
+; GFX12-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX12-FAKE16-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
 bb:
   %i = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 poison)
   %i2 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %arg, i32 %arg1, i32 0, i32 0, i32 0)
@@ -163,3 +220,6 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32 immarg) #1
 attributes #0 = { nounwind readnone willreturn }
 attributes #1 = { nounwind readonly willreturn }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
+; GFX12: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 87b83f68d685d..4319bdd5d9b65 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
 ; GFX68-LABEL: buffer_store:
@@ -225,11 +226,17 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1,
 ; GFX68-NEXT:    buffer_store_short v0, v1, s[0:3], 0 idxen
 ; GFX68-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: struct_buffer_store_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 idxen
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: struct_buffer_store_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 idxen
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: struct_buffer_store_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 idxen
+; GFX11-FAKE16-NEXT:    s_endpgm
   %v2 = fptrunc float %v1 to half
   call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index bc50b12b59049..ff5b17f7324cb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=CHECK
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck %s -check-prefixes=CHECK,CHECK-FAKE16
 
 define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr, i32 %index) {
 ; CHECK-LABEL: struct_ptr_atomic_buffer_load_i32:
@@ -257,29 +259,75 @@ bb2:
 }
 
 define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr, i32 %index) {
-; CHECK-LABEL: struct_ptr_atomic_buffer_load_v4i16:
-; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; CHECK-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v1, s6
-; CHECK-NEXT:  .LBB8_1: ; %bb1
-; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
-; CHECK-NEXT:    s_or_b32 s4, vcc_lo, s4
-; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
-; CHECK-NEXT:    s_cbranch_execnz .LBB8_1
-; CHECK-NEXT:  ; %bb.2: ; %bb2
-; CHECK-NEXT:    s_endpgm
+; CHECK-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; CHECK-SDAG-TRUE16:       ; %bb.0: ; %bb
+; CHECK-SDAG-TRUE16-NEXT:    s_clause 0x1
+; CHECK-SDAG-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; CHECK-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-SDAG-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; CHECK-SDAG-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-SDAG-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-SDAG-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-SDAG-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-SDAG-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-SDAG-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-SDAG-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; CHECK-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; CHECK-FAKE16:       ; %bb.0: ; %bb
+; CHECK-FAKE16-NEXT:    s_clause 0x1
+; CHECK-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; CHECK-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-FAKE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-FAKE16-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-FAKE16-NEXT:  .LBB8_1: ; %bb1
+; CHECK-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-FAKE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-FAKE16-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; CHECK-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-FAKE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-FAKE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-FAKE16-NEXT:    s_endpgm
+;
+; CHECK-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
+; CHECK-GISEL-TRUE16:       ; %bb.0: ; %bb
+; CHECK-GISEL-TRUE16-NEXT:    s_clause 0x1
+; CHECK-GISEL-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; CHECK-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; CHECK-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_mov_b32 s4, 0
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, s6
+; CHECK-GISEL-TRUE16-NEXT:  .LBB8_1: ; %bb1
+; CHECK-GISEL-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-GISEL-TRUE16-NEXT:    buffer_load_b64 v[2:3], v1, s[0:3], 0 idxen offset:4 glc
+; CHECK-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; CHECK-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-GISEL-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v0
+; CHECK-GISEL-TRUE16-NEXT:    s_or_b32 s4, vcc_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s4
+; CHECK-GISEL-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; CHECK-GISEL-TRUE16-NEXT:  ; %bb.2: ; %bb2
+; CHECK-GISEL-TRUE16-NEXT:    s_endpgm
 bb:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
index 68a585cce2e23..ff02c2e2f58e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 ; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 @esgs_ring = external addrspace(3) global [0 x i32], align 65536
 
 define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) {
@@ -86,31 +87,58 @@ define amdgpu_gs void @main(ptr addrspace(8) %arg, i32 %arg1) {
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    ds_write2_b32 v2, v0, v1 offset0:7 offset1:8
 ;
-; GFX11-LABEL: main:
-; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_mov_b32 s1, exec_lo
-; GFX11-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX11-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX11-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX11-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT:    s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s0
-; GFX11-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
-; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX11-NEXT:    ; implicit-def: $vgpr4
-; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
-; GFX11-NEXT:  ; %bb.2:
-; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
-; GFX11-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
+; GFX11-TRUE16-LABEL: main:
+; GFX11-TRUE16:       ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-TRUE16-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX11-TRUE16-NEXT:  ; %bb.2:
+; GFX11-TRUE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v5.h
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-TRUE16-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
+;
+; GFX11-FAKE16-LABEL: main:
+; GFX11-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-FAKE16-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, vcc_lo, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s0
+; GFX11-FAKE16-NEXT:    buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX11-FAKE16-NEXT:  ; %bb.2:
+; GFX11-FAKE16-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT:    ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
 bb:
   %i = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 poison)
   %i2 = call nsz arcp <3 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v3f16(ptr addrspace(8) %arg, i32 %arg1, i32 0, i32 0, i32 0)
@@ -134,3 +162,5 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 declare <3 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v3f16(ptr addrspace(8), i32, i32, i32, i32 immarg) #1
 attributes #0 = { nounwind readnone willreturn }
 attributes #1 = { nounwind readonly willreturn }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 88ef7a9363930..f44faf4f7edba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -2,14 +2,18 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
 
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-SDAG-LABEL: test_frexp_f16_i32:
@@ -43,27 +47,49 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_f16_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_f16_i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_f16_i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_f16_i32:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_f16_i32:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_f16_i32:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
@@ -77,6 +103,50 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_f16_i32:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_f16_i32:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_f16_i32:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_f16_i32:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   ret { half, i32 } %result
 }
@@ -105,21 +175,37 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
 ; GFX9-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_f16_i32_only_use_fract:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_f16_i32_only_use_fract:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
@@ -131,6 +217,38 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_f16_i32_only_use_fract:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   %result.0 = extractvalue { half, i32 } %result, 0
   ret half %result.0
@@ -162,25 +280,45 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_f16_i32_only_use_exp:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_f16_i32_only_use_exp:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
@@ -191,6 +329,46 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_f16_i32_only_use_exp:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i32 } @llvm.frexp.f16.i32(half %a)
   %result.0 = extractvalue { half, i32 } %result, 1
   ret i32 %result.0
@@ -243,39 +421,71 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_v2f16_v2i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v3, v1
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
-; GFX11-NEXT:    v_bfe_i32 v1, v0, 0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v3
-; GFX11-NEXT:    v_bfe_i32 v2, v4, 0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_v2f16_v2i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v3, v1
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
-; GFX12-NEXT:    v_bfe_i32 v1, v0, 0, 16
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_pack_b32_f16 v0, v2, v3
-; GFX12-NEXT:    v_bfe_i32 v2, v4, 0, 16
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v3, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v3, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
@@ -320,6 +530,72 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v0, 0, 16
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v4
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v3, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v3, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v4, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_bfe_i32 v1, v0, 0, 16
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_bfe_i32 v2, v4, 0, 16
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
   ret { <2 x half>, <2 x i32> } %result
 }
@@ -357,29 +633,51 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v1, v1
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v1, v1
-; GFX12-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
@@ -404,6 +702,52 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
   %result.0 = extractvalue { <2 x half>, <2 x i32> } %result, 0
   ret <2 x half> %result.0
@@ -444,33 +788,57 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
-; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
@@ -505,6 +873,58 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v0, 0, 16
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a)
   %result.1 = extractvalue { <2 x half>, <2 x i32> } %result, 1
   ret <2 x i32> %result.1
@@ -540,27 +960,49 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_f16_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_f16_i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_mov_b32_e32 v0, v2
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_f16_i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_f16_i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_f16_i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_f16_i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
@@ -574,6 +1016,50 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_f16_i16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_f16_i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_f16_i16:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v2.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_f16_i16:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v2, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   ret { half, i16 } %result
 }
@@ -602,21 +1088,37 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) {
 ; GFX9-NEXT:    v_frexp_mant_f16_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_f16_i16_only_use_fract:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_f16_i16_only_use_fract:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_fract:
 ; GFX6-GISEL:       ; %bb.0:
@@ -628,6 +1130,38 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) {
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_f16_i16_only_use_fract:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_mant_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   %result.0 = extractvalue { half, i16 } %result, 0
   ret half %result.0
@@ -657,21 +1191,37 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 ; GFX9-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_frexp_f16_i16_only_use_exp:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_frexp_f16_i16_only_use_exp:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_frexp_f16_i16_only_use_exp:
 ; GFX6-GISEL:       ; %bb.0:
@@ -682,6 +1232,38 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
 ; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX12-GISEL-TRUE16:       ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_frexp_f16_i16_only_use_exp:
+; GFX12-GISEL-FAKE16:       ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call { half, i16 } @llvm.frexp.f16.i16(half %a)
   %result.0 = extractvalue { half, i16 } %result, 1
   ret i16 %result.0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index ff8b539fd5ebb..1dd6a7926029e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -5,8 +5,10 @@
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck  -check-prefixes=GFX689,VI,GFX689-GISEL,VI-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX689,GFX900,GFX689-SDAG,GFX900-SDAG %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX689,GFX900,GFX689-GISEL,GFX900-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG,GFX1100-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG,GFX1100-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL,GFX1100-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL,GFX1100-GISEL-FAKE16 %s
 
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
 ; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
@@ -6010,39 +6012,73 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log_f32_from_fpext_f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_f32_from_fpext_f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_f32_from_fpext_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-TRUE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_f32_from_fpext_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_f32_from_fpext_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-GISEL-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-TRUE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_f32_from_fpext_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-FAKE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_from_fpext_f16:
 ; R600:       ; %bb.0:
@@ -6179,41 +6215,77 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log_f32_from_fpext_math_f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_f32_from_fpext_math_f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_f32_from_fpext_math_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-SDAG-TRUE16-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_f32_from_fpext_math_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_fmamk_f32 v2, v0, 0x3377d1cf, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_f32_from_fpext_math_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-GISEL-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-TRUE16-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_f32_from_fpext_math_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f32_from_fpext_math_f16:
 ; R600:       ; %bb.0:
@@ -6376,13 +6448,37 @@ define half @v_log_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f16:
 ; R600:       ; %bb.0:
@@ -6432,13 +6528,37 @@ define half @v_log_fabs_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_fabs_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_fabs_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_fabs_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_fabs_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fabs_f16:
 ; R600:       ; %bb.0:
@@ -6489,13 +6609,37 @@ define half @v_log_fneg_fabs_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_fneg_fabs_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_fneg_fabs_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_fneg_fabs_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_fneg_fabs_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_fabs_f16:
 ; R600:       ; %bb.0:
@@ -6547,13 +6691,37 @@ define half @v_log_fneg_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_fneg_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, -v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_fneg_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_fneg_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_fneg_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_f16:
 ; R600:       ; %bb.0:
@@ -6604,13 +6772,37 @@ define half @v_log_f16_fast(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_f16_fast:
 ; R600:       ; %bb.0:
@@ -6688,18 +6880,55 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_v2f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v2f16:
 ; R600:       ; %bb.0:
@@ -6794,33 +7023,59 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log_fabs_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, |v1|
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_fabs_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, |v0.h|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_fabs_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, |v1|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_fabs_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_fabs_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fabs_v2f16:
 ; R600:       ; %bb.0:
@@ -6920,33 +7175,59 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log_fneg_fabs_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -|v1|
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_fneg_fabs_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -|v0.h|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_fneg_fabs_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, -|v1|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_fneg_fabs_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_fabs_v2f16:
 ; R600:       ; %bb.0:
@@ -7047,33 +7328,59 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log_fneg_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -v1
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_fneg_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_fneg_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, -v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_fneg_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_fneg_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_fneg_v2f16:
 ; R600:       ; %bb.0:
@@ -7152,18 +7459,55 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_v2f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_v2f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_v2f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_v2f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v2f16_fast:
 ; R600:       ; %bb.0:
@@ -7244,21 +7588,65 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_v3f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
-; GFX1100-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_v3f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_v3f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_v3f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v3f16:
 ; R600:       ; %bb.0:
@@ -7339,21 +7727,65 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log_v3f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
-; GFX1100-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_v3f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_v3f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_v3f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v3f16_fast:
 ; R600:       ; %bb.0:
@@ -7481,47 +7913,82 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log_v4f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_v4f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_v4f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_v4f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_v4f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v4f16:
 ; R600:       ; %bb.0:
@@ -7649,47 +8116,82 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log_v4f16_fast:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log_v4f16_fast:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log_v4f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log_v4f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log_v4f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log_v4f16_fast:
 ; R600:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 4f783589f148f..86a58d26c6ae5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -5,8 +5,10 @@
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck  -check-prefixes=GFX689,VI,GFX689-GISEL,VI-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX689,GFX900,GFX689-SDAG,GFX900-SDAG %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX689,GFX900,GFX689-GISEL,GFX900-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG,GFX1100-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG,GFX1100-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL,GFX1100-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL,GFX1100-GISEL-FAKE16 %s
 
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
 ; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
@@ -6010,39 +6012,73 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log10_f32_from_fpext_f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_f32_from_fpext_f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_f32_from_fpext_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-TRUE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_f32_from_fpext_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_f32_from_fpext_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-GISEL-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-TRUE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_f32_from_fpext_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-FAKE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_from_fpext_f16:
 ; R600:       ; %bb.0:
@@ -6179,41 +6215,77 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log10_f32_from_fpext_math_f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-SDAG-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_f32_from_fpext_math_f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
-; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_f32_from_fpext_math_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-SDAG-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-SDAG-TRUE16-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_f32_from_fpext_math_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_fmamk_f32 v2, v0, 0x3284fbcf, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_f32_from_fpext_math_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-GISEL-TRUE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-TRUE16-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_f32_from_fpext_math_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f32_from_fpext_math_f16:
 ; R600:       ; %bb.0:
@@ -6376,13 +6448,37 @@ define half @v_log10_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f16:
 ; R600:       ; %bb.0:
@@ -6432,13 +6528,37 @@ define half @v_log10_fabs_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_fabs_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_fabs_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_fabs_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_fabs_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fabs_f16:
 ; R600:       ; %bb.0:
@@ -6489,13 +6609,37 @@ define half @v_log10_fneg_fabs_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_fneg_fabs_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_fneg_fabs_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_fneg_fabs_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_fneg_fabs_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_fabs_f16:
 ; R600:       ; %bb.0:
@@ -6547,13 +6691,37 @@ define half @v_log10_fneg_f16(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_fneg_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, -v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_fneg_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_fneg_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_fneg_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_f16:
 ; R600:       ; %bb.0:
@@ -6604,13 +6772,37 @@ define half @v_log10_f16_fast(half %in) {
 ; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_f16_fast:
 ; R600:       ; %bb.0:
@@ -6688,18 +6880,55 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_v2f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v2f16:
 ; R600:       ; %bb.0:
@@ -6794,33 +7023,59 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log10_fabs_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, |v1|
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_fabs_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, |v0.h|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_fabs_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, |v1|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_fabs_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_fabs_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fabs_v2f16:
 ; R600:       ; %bb.0:
@@ -6920,33 +7175,59 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log10_fneg_fabs_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -|v1|
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -|v0.h|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_fneg_fabs_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, -|v1|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_fneg_fabs_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_fabs_v2f16:
 ; R600:       ; %bb.0:
@@ -7047,33 +7328,59 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log10_fneg_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -v1
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_fneg_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_fneg_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, -v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_fneg_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_fneg_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_fneg_v2f16:
 ; R600:       ; %bb.0:
@@ -7152,18 +7459,55 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_v2f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_v2f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_v2f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_v2f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v2f16_fast:
 ; R600:       ; %bb.0:
@@ -7244,21 +7588,65 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_v3f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
-; GFX1100-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_v3f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_v3f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_v3f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v3f16:
 ; R600:       ; %bb.0:
@@ -7339,21 +7727,65 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log10_v3f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
-; GFX1100-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_v3f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_v3f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_v3f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_3)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v3f16_fast:
 ; R600:       ; %bb.0:
@@ -7481,47 +7913,82 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log10_v4f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_v4f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_v4f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_v4f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_v4f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v4f16:
 ; R600:       ; %bb.0:
@@ -7649,47 +8116,82 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log10_v4f16_fast:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX1100-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log10_v4f16_fast:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX1100-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log10_v4f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log10_v4f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log10_v4f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log10_v4f16_fast:
 ; R600:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index a98baa2fdb35c..ea88f77f98735 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -5,8 +5,10 @@
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GFX689,VI,GFX689-GISEL,VI-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX689,GFX900,GFX689-SDAG,GFX900-SDAG %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX689,GFX900,GFX689-GISEL,GFX900-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG,GFX1100-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-SDAG,GFX1100-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL,GFX1100-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1100,GFX1100-GISEL,GFX1100-GISEL-FAKE16 %s
 
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
 ; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
@@ -3642,13 +3644,37 @@ define float @v_log2_f32_from_fpext_f16(i16 %src.i) {
 ; GFX689-NEXT:    v_log_f32_e32 v0, v0
 ; GFX689-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_f32_from_fpext_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_f32_from_fpext_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_f32_from_fpext_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_f32_from_fpext_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_f32_from_fpext_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_from_fpext_f16:
 ; R600:       ; %bb.0:
@@ -3709,14 +3735,41 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_f32_from_fpext_math_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1100-NEXT:    v_log_f32_e32 v0, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_f32_from_fpext_math_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_f32_from_fpext_math_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_f32_from_fpext_math_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_f32_from_fpext_math_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f32_from_fpext_math_f16:
 ; R600:       ; %bb.0:
@@ -3837,11 +3890,29 @@ define half @v_log2_f16(half %in) {
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f16:
 ; R600:       ; %bb.0:
@@ -3887,11 +3958,29 @@ define half @v_log2_fabs_f16(half %in) {
 ; GFX900-NEXT:    v_log_f16_e64 v0, |v0|
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_fabs_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_fabs_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_fabs_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_fabs_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fabs_f16:
 ; R600:       ; %bb.0:
@@ -3938,11 +4027,29 @@ define half @v_log2_fneg_fabs_f16(half %in) {
 ; GFX900-NEXT:    v_log_f16_e64 v0, -|v0|
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_fneg_fabs_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_fneg_fabs_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_fneg_fabs_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_fabs_f16:
 ; R600:       ; %bb.0:
@@ -3990,11 +4097,29 @@ define half @v_log2_fneg_f16(half %in) {
 ; GFX900-NEXT:    v_log_f16_e64 v0, -v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_fneg_f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e64 v0, -v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_fneg_f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_fneg_f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_fneg_f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_f16:
 ; R600:       ; %bb.0:
@@ -4041,11 +4166,29 @@ define half @v_log2_f16_fast(half %in) {
 ; GFX900-NEXT:    v_log_f16_e32 v0, v0
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_f16_fast:
 ; R600:       ; %bb.0:
@@ -4119,16 +4262,45 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_v2f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v2f16:
 ; R600:       ; %bb.0:
@@ -4209,28 +4381,48 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log2_fabs_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, |v1|
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log2_fabs_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, |v0.h|
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_fabs_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, |v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, |v1|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_fabs_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_fabs_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fabs_v2f16:
 ; R600:       ; %bb.0:
@@ -4316,28 +4508,48 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -|v1|
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -|v0.h|
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -|v0|
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, -|v1|
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_fneg_fabs_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_fabs_v2f16:
 ; R600:       ; %bb.0:
@@ -4424,28 +4636,48 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log2_fneg_v2f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v0, -v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e64 v1, -v1
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log2_fneg_v2f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_fneg_v2f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v0, -v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e64 v1, -v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_fneg_v2f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_fneg_v2f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_fneg_v2f16:
 ; R600:       ; %bb.0:
@@ -4520,16 +4752,45 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_v2f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_v2f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_v2f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_v2f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v2f16_fast:
 ; R600:       ; %bb.0:
@@ -4615,17 +4876,49 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_v3f16:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_v3f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_v3f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_v3f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v3f16:
 ; R600:       ; %bb.0:
@@ -4711,17 +5004,49 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: v_log2_v3f16_fast:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_v3f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_v3f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_v3f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v3f16_fast:
 ; R600:       ; %bb.0:
@@ -4823,35 +5148,60 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log2_v4f16:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log2_v4f16:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_v4f16:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_v4f16:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_v4f16:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v4f16:
 ; R600:       ; %bb.0:
@@ -4953,35 +5303,60 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-SDAG-LABEL: v_log2_v4f16_fast:
-; GFX1100-SDAG:       ; %bb.0:
-; GFX1100-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-SDAG-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-SDAG-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-GISEL-LABEL: v_log2_v4f16_fast:
-; GFX1100-GISEL:       ; %bb.0:
-; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX1100-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v0, v0
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v2, v2
-; GFX1100-GISEL-NEXT:    v_log_f16_e32 v3, v3
-; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX1100-GISEL-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1100-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast:
+; GFX1100-SDAG-TRUE16:       ; %bb.0:
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-SDAG-FAKE16-LABEL: v_log2_v4f16_fast:
+; GFX1100-SDAG-FAKE16:       ; %bb.0:
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-TRUE16-LABEL: v_log2_v4f16_fast:
+; GFX1100-GISEL-TRUE16:       ; %bb.0:
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
+; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-GISEL-FAKE16-LABEL: v_log2_v4f16_fast:
+; GFX1100-GISEL-FAKE16:       ; %bb.0:
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v0, v0
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v1, v1
+; GFX1100-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v2, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_log_f16_e32 v3, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX1100-GISEL-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1100-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; R600-LABEL: v_log2_v4f16_fast:
 ; R600:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
index a467e29d0bff6..64cd94134f8f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX78,GFX7 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GFX78,GFX8 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX78-LABEL: v_powi_f16:
@@ -15,19 +16,33 @@ define i16 @v_powi_f16(i16 %l, i32 %r) {
 ; GFX78-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX78-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_powi_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_log_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
-; GFX11-NEXT:    v_exp_f32_e32 v0, v0
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_powi_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_powi_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_log_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_dx9_zero_f32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_exp_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %l.cast = bitcast i16 %l to half
   %res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r)
   %res.cast = bitcast half %res to i16

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index c0a85bba93b73..c29362898f40e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
 
 define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
@@ -827,25 +828,45 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: round_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_trunc_f16_e32 v0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_f16_e32 v1, s2, v0
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s3, |v1|, 0.5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x3c00, s3
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, v1, s2
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: round_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v0.h, s2, v0.l
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, |v0.h|, 0.5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v1, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: round_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v0, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_sub_f16_e32 v1, s2, v0
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e64 s3, |v1|, 0.5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 0x3c00, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff, v1, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_f16:
 ; R600:       ; %bb.0:
@@ -968,36 +989,67 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: round_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11-NEXT:    v_trunc_f16_e32 v1, s2
-; GFX11-NEXT:    v_trunc_f16_e32 v0, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_f16_e32 v3, s2, v1
-; GFX11-NEXT:    v_sub_f16_e32 v2, s3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s4, |v3|, 0.5
-; GFX11-NEXT:    v_bfi_b32 v2, 0x7fff, v2, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 0x3c00, s4
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, v3, s2
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: round_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, s2
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, s3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v1.h, s2, v0.h
+; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v1.l, s3, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s1, |v1.h|, 0.5
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, |v1.l|, 0.5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, 0, 0x3c00, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v2, s2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v1, s3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.h, v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: round_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v1, s2
+; GFX11-FAKE16-NEXT:    v_trunc_f16_e32 v0, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_sub_f16_e32 v3, s2, v1
+; GFX11-FAKE16-NEXT:    v_sub_f16_e32 v2, s3, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_ge_f16_e64 s4, |v3|, 0.5
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v2, 0x7fff, v2, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v3, 0, 0x3c00, s4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v3, 0x7fff, v3, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; R600-LABEL: round_v2f16:
 ; R600:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 4031be65fab61..3b0f8523e1b52 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7-HSA %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8-NOHSA %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 ; TODO: NOT AND
 define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
@@ -79,15 +80,25 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_load_i8:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_load_i8:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_load_i8:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
   %ld = load i8, ptr addrspace(4) %in
   store i8 %ld, ptr addrspace(1) %out
@@ -167,15 +178,25 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_load_v2i8:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_load_v2i8:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_load_v2i8:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
 entry:
   %ld = load <2 x i8>, ptr addrspace(4) %in
   store <2 x i8> %ld, ptr addrspace(1) %out
@@ -923,19 +944,33 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
 ;
-; GFX12-LABEL: constant_zextload_v2i8_to_v2i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v2, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v2i8_to_v2i32:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v2i8_to_v2i32:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v2, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, ptr addrspace(1) %out
@@ -5279,16 +5314,27 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_i8_to_i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_i8_to_i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-TRUE16-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_i8_to_i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i64
   store i64 %ext, ptr addrspace(1) %out
@@ -5366,18 +5412,31 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; EG-NEXT:     ASHR * T0.Y, PV.X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_i8_to_i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_i8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_i8_to_i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_i8 v0, v2, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_i8_to_i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_i8 v0, v2, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = sext i8 %a to i64
   store i64 %ext, ptr addrspace(1) %out
@@ -5537,18 +5596,31 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; EG-NEXT:     ASHR * T0.Y, PV.X, literal.x,
 ; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v1i8_to_v1i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_i8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v1i8_to_v1i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_i8 v0, v2, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v1i8_to_v1i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_i8 v0, v2, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, ptr addrspace(1) %out
@@ -5645,19 +5717,33 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.y,
 ; EG-NEXT:    255(3.573311e-43), 2(2.802597e-45)
 ;
-; GFX12-LABEL: constant_zextload_v2i8_to_v2i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v2i8_to_v2i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v2i8_to_v2i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(1) %out
@@ -5757,22 +5843,39 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; EG-NEXT:     ASHR * T4.W, PV.Z, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
 ;
-; GFX12-LABEL: constant_sextload_v2i8_to_v2i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v4, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 8
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v2i8_to_v2i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v4, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-TRUE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v2i8_to_v2i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v4, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(1) %out
@@ -9063,15 +9166,25 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_i8_to_i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_i8_to_i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_i8_to_i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = zext i8 %a to i16
   store i16 %ext, ptr addrspace(1) %out
@@ -9152,15 +9265,25 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_i8_to_i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_i8_to_i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_i8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_i8_to_i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_i8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a = load i8, ptr addrspace(4) %in
   %ext = sext i8 %a to i16
   store i16 %ext, ptr addrspace(1) %out
@@ -9239,15 +9362,25 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v1i8_to_v1i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v1i8_to_v1i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v1i8_to_v1i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = zext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, ptr addrspace(1) %out
@@ -9328,15 +9461,25 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v1i8_to_v1i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_i8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v1i8_to_v1i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_i8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v1i8_to_v1i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_i8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <1 x i8>, ptr addrspace(4) %in
   %ext = sext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, ptr addrspace(1) %out
@@ -9421,22 +9564,39 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v2i8_to_v2i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v2i8_to_v2i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v2i8_to_v2i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = zext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, ptr addrspace(1) %out
@@ -9538,22 +9698,39 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v2i8_to_v2i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 16
-; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
-; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v2i8_to_v2i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-TRUE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v2i8_to_v2i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v2, v1, 0, 16
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
+; GFX12-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i8>, ptr addrspace(4) %in
   %ext = sext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, ptr addrspace(1) %out
@@ -9763,25 +9940,46 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_INT * T5.Y, PV.W, 0.0, literal.x,
 ; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX12-NEXT:    s_sext_i32_i16 s5, s2
-; GFX12-NEXT:    s_ashr_i32 s4, s2, 24
-; GFX12-NEXT:    s_bfe_i32 s2, s2, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s5, s5, 8
-; GFX12-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
-; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v4i8_to_v4i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s2
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s2, s2
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s5, s5, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s5, s2
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v4i8_to_v4i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s5, s2
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s4, s2, 24
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s2, s2, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s5, s5, 8
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s2, s5
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <4 x i8>, ptr addrspace(4) %in
   %ext = sext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, ptr addrspace(1) %out
@@ -10072,34 +10270,64 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; EG-NEXT:     BFE_INT * T6.Y, PS, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
 ;
-; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_ashr_i64 s[4:5], s[2:3], 56
-; GFX12-NEXT:    s_lshr_b32 s6, s2, 16
-; GFX12-NEXT:    s_lshr_b32 s7, s3, 16
-; GFX12-NEXT:    s_bfe_i32 s5, s3, 0x80000
-; GFX12-NEXT:    s_sext_i32_i16 s3, s3
-; GFX12-NEXT:    s_ashr_i32 s8, s2, 24
-; GFX12-NEXT:    s_bfe_i32 s9, s2, 0x80000
-; GFX12-NEXT:    s_sext_i32_i16 s2, s2
-; GFX12-NEXT:    s_bfe_i32 s7, s7, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s3, s3, 8
-; GFX12-NEXT:    s_bfe_i32 s6, s6, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s2, s2, 8
-; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
-; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
-; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s9, s2
-; GFX12-NEXT:    s_pack_ll_b32_b16 s5, s6, s8
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT:    v_mov_b32_e32 v2, s3
-; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v8i8_to_v8i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_ashr_i64 s[4:5], s[2:3], 56
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s5, s3, 0x80000
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s8, s2, 24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s2
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s2, s2
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s6, s6, 0x80000
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s9, s9, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s9, s2
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s6, s8
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-TRUE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v8i8_to_v8i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_ashr_i64 s[4:5], s[2:3], 56
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s6, s2, 16
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s7, s3, 16
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s5, s3, 0x80000
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s8, s2, 24
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s9, s2, 0x80000
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s2, s2
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s6, s6, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s7, s4
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s5, s3
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s9, s2
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s6, s8
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <8 x i8>, ptr addrspace(4) %in
   %ext = sext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, ptr addrspace(1) %out
@@ -11491,94 +11719,183 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ; EG-NEXT:     BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
 ; EG-NEXT:    2(2.802597e-45), 8(1.121039e-44)
 ;
-; GFX12-LABEL: constant_sextload_v32i8_to_v32i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshr_b32 s13, s5, 16
-; GFX12-NEXT:    s_lshr_b32 s16, s0, 16
-; GFX12-NEXT:    s_lshr_b32 s17, s1, 16
-; GFX12-NEXT:    s_ashr_i32 s18, s1, 16
-; GFX12-NEXT:    s_bfe_i32 s19, s1, 0x80000
-; GFX12-NEXT:    s_sext_i32_i16 s20, s1
-; GFX12-NEXT:    s_ashr_i32 s21, s0, 24
-; GFX12-NEXT:    s_bfe_i32 s22, s0, 0x80000
-; GFX12-NEXT:    s_sext_i32_i16 s23, s0
-; GFX12-NEXT:    s_ashr_i64 s[0:1], s[4:5], 56
-; GFX12-NEXT:    s_lshr_b32 s12, s4, 16
-; GFX12-NEXT:    s_bfe_i32 s1, s5, 0x80000
-; GFX12-NEXT:    s_sext_i32_i16 s5, s5
-; GFX12-NEXT:    s_bfe_i32 s13, s13, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s5, s5, 8
-; GFX12-NEXT:    s_pack_ll_b32_b16 s0, s13, s0
-; GFX12-NEXT:    s_ashr_i32 s13, s4, 24
-; GFX12-NEXT:    s_bfe_i32 s12, s12, 0x80000
-; GFX12-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
-; GFX12-NEXT:    s_pack_ll_b32_b16 s5, s12, s13
-; GFX12-NEXT:    s_sext_i32_i16 s12, s4
-; GFX12-NEXT:    s_bfe_i32 s4, s4, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s12, s12, 8
-; GFX12-NEXT:    s_ashr_i32 s13, s7, 16
-; GFX12-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
-; GFX12-NEXT:    s_lshr_b32 s12, s13, 8
-; GFX12-NEXT:    s_sext_i32_i16 s13, s7
-; GFX12-NEXT:    s_lshr_b32 s11, s7, 16
-; GFX12-NEXT:    s_bfe_i32 s7, s7, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s13, s13, 8
-; GFX12-NEXT:    s_lshr_b32 s10, s6, 16
-; GFX12-NEXT:    s_bfe_i32 s11, s11, 0x80000
-; GFX12-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
-; GFX12-NEXT:    s_sext_i32_i16 s13, s6
-; GFX12-NEXT:    s_lshr_b32 s14, s2, 16
-; GFX12-NEXT:    s_lshr_b32 s15, s3, 16
-; GFX12-NEXT:    s_ashr_i32 s24, s3, 16
-; GFX12-NEXT:    s_bfe_i32 s25, s3, 0x80000
-; GFX12-NEXT:    s_sext_i32_i16 s3, s3
-; GFX12-NEXT:    s_ashr_i32 s26, s2, 24
-; GFX12-NEXT:    s_bfe_i32 s27, s2, 0x80000
-; GFX12-NEXT:    s_sext_i32_i16 s2, s2
-; GFX12-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
-; GFX12-NEXT:    s_ashr_i32 s12, s6, 24
-; GFX12-NEXT:    s_bfe_i32 s6, s6, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s13, s13, 8
-; GFX12-NEXT:    s_bfe_i32 s10, s10, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s24, s24, 8
-; GFX12-NEXT:    s_bfe_i32 s15, s15, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s3, s3, 8
-; GFX12-NEXT:    s_bfe_i32 s14, s14, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s2, s2, 8
-; GFX12-NEXT:    s_pack_ll_b32_b16 s6, s6, s13
-; GFX12-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
-; GFX12-NEXT:    s_lshr_b32 s18, s18, 8
-; GFX12-NEXT:    s_bfe_i32 s17, s17, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s20, s20, 8
-; GFX12-NEXT:    s_bfe_i32 s16, s16, 0x80000
-; GFX12-NEXT:    s_lshr_b32 s23, s23, 8
-; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
-; GFX12-NEXT:    s_pack_ll_b32_b16 s15, s15, s24
-; GFX12-NEXT:    s_pack_ll_b32_b16 s3, s25, s3
-; GFX12-NEXT:    s_pack_ll_b32_b16 s14, s14, s26
-; GFX12-NEXT:    s_pack_ll_b32_b16 s2, s27, s2
-; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
-; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT:    s_pack_ll_b32_b16 s17, s17, s18
-; GFX12-NEXT:    s_pack_ll_b32_b16 s18, s19, s20
-; GFX12-NEXT:    s_pack_ll_b32_b16 s16, s16, s21
-; GFX12-NEXT:    s_pack_ll_b32_b16 s19, s22, s23
-; GFX12-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
-; GFX12-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v9, s14
-; GFX12-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15
-; GFX12-NEXT:    v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16
-; GFX12-NEXT:    v_dual_mov_b32 v12, s19 :: v_dual_mov_b32 v15, s17
-; GFX12-NEXT:    v_mov_b32_e32 v14, s18
-; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[8:9] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[8:9]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v32i8_to_v32i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s16, s0, 16
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s17, s1, 16
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s18, s1, 16
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s19, s1, 0x80000
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s20, s1
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s21, s0, 24
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s22, s0, 0x80000
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s23, s0
+; GFX12-TRUE16-NEXT:    s_ashr_i64 s[0:1], s[4:5], 56
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s1, s5
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s13, s13, 0x80000
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s5, s5, 0x80000
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s0, s13, s0
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s1, s1, 8
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s13, s4, 24
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s12, s12, 0x80000
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s5, s1
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s5, s12, s13
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s12, s4
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s4, s4, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s12, s12, 8
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s13, s7, 16
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s12, s13, 8
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s13, s7
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s11, s7, 16
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s13, s13, 8
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s10, s6, 16
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s11, s11, 0x80000
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s13, s6
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s14, s2, 16
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s15, s3, 16
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s24, s3, 16
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s25, s3, 0x80000
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s26, s2, 24
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s27, s2, 0x80000
+; GFX12-TRUE16-NEXT:    s_sext_i32_i16 s2, s2
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX12-TRUE16-NEXT:    s_ashr_i32 s12, s6, 24
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s6, s6, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s13, s13, 8
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s10, s10, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s24, s24, 8
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s15, s15, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s14, s14, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s6, s6, s13
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s18, s18, 8
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s17, s17, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s20, s20, 8
+; GFX12-TRUE16-NEXT:    s_bfe_i32 s16, s16, 0x80000
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s23, s23, 8
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s15, s15, s24
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s3, s25, s3
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s14, s14, s26
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s2, s27, s2
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s18
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s18, s19, s20
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s21
+; GFX12-TRUE16-NEXT:    s_pack_ll_b32_b16 s19, s22, s23
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v9, s14
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v12, s19 :: v_dual_mov_b32 v15, s17
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v14, s18
+; GFX12-TRUE16-NEXT:    s_clause 0x3
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[0:3], s[8:9] offset:48
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:32
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[12:15], s[8:9]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v32i8_to_v32i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s13, s5, 16
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s16, s0, 16
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s17, s1, 16
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s18, s1, 16
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s19, s1, 0x80000
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s20, s1
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s21, s0, 24
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s22, s0, 0x80000
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s23, s0
+; GFX12-FAKE16-NEXT:    s_ashr_i64 s[0:1], s[4:5], 56
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s12, s4, 16
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s1, s5, 0x80000
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s13, s13, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s5, s5, 8
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s0, s13, s0
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s13, s4, 24
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s12, s12, 0x80000
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s5, s12, s13
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s12, s4
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s4, s4, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s12, s12, 8
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s13, s7, 16
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s4, s4, s12
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s12, s13, 8
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s13, s7
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s11, s7, 16
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s7, s7, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s13, s13, 8
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s10, s6, 16
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s11, s11, 0x80000
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s7, s7, s13
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s13, s6
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s14, s2, 16
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s15, s3, 16
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s24, s3, 16
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s25, s3, 0x80000
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s26, s2, 24
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s27, s2, 0x80000
+; GFX12-FAKE16-NEXT:    s_sext_i32_i16 s2, s2
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s11, s11, s12
+; GFX12-FAKE16-NEXT:    s_ashr_i32 s12, s6, 24
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s6, s6, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s13, s13, 8
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s10, s10, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s24, s24, 8
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s15, s15, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s14, s14, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s6, s6, s13
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s10, s10, s12
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s18, s18, 8
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s17, s17, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s20, s20, 8
+; GFX12-FAKE16-NEXT:    s_bfe_i32 s16, s16, 0x80000
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s23, s23, 8
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s15, s15, s24
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s3, s25, s3
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s14, s14, s26
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s2, s27, s2
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s17, s17, s18
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s18, s19, s20
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s16, s16, s21
+; GFX12-FAKE16-NEXT:    s_pack_ll_b32_b16 s19, s22, s23
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v9, s14
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s15
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v10, s3 :: v_dual_mov_b32 v13, s16
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v12, s19 :: v_dual_mov_b32 v15, s17
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v14, s18
+; GFX12-FAKE16-NEXT:    s_clause 0x3
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[0:3], s[8:9] offset:48
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[4:7], s[8:9] offset:32
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[8:11], s[8:9] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[12:15], s[8:9]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <32 x i8>, ptr addrspace(4) %in
   %ext = sext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/lrint.ll b/llvm/test/CodeGen/AMDGPU/lrint.ll
index 58f782fd4ecdd..2f8ea71c1d4be 100644
--- a/llvm/test/CodeGen/AMDGPU/lrint.ll
+++ b/llvm/test/CodeGen/AMDGPU/lrint.ll
@@ -4,8 +4,10 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
 
 declare float @llvm.rint.f32(float)
 declare i32 @llvm.lrint.i32.f32(float)
@@ -491,11 +493,41 @@ entry:
 }
 
 define half @intrinsic_frint_half(half %arg) {
-; GCN-LABEL: intrinsic_frint_half:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_rndne_f16_e32 v0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: intrinsic_frint_half:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: intrinsic_frint_half:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: intrinsic_frint_half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: intrinsic_frint_half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: intrinsic_frint_half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: intrinsic_frint_half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = tail call half @llvm.rint.f16(half %arg)
   ret half %res
@@ -518,14 +550,41 @@ define i32 @intrinsic_lrint_i32_f16(half %arg) {
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: intrinsic_lrint_i32_f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: intrinsic_lrint_i32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: intrinsic_lrint_i32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: intrinsic_lrint_i32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: intrinsic_lrint_i32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = tail call i32 @llvm.lrint.i32.f16(half %arg)
   ret i32 %res

diff  --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index 7911631483931..8036e32f90eb0 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -3,8 +3,10 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-GISEL-FAKE16 %s
 
 define i32 @intrinsic_lround_i32_f32(float %arg) {
 ; GFX9-SDAG-LABEL: intrinsic_lround_i32_f32:
@@ -816,34 +818,66 @@ define half @intrinsic_fround_half(half %arg) {
 ; GFX10-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: intrinsic_fround_half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: intrinsic_fround_half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_trunc_f16_e32 v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_sub_f16_e32 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, |v1.l|, 0.5
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v1.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: intrinsic_fround_half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-FAKE16-LABEL: intrinsic_fround_half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: intrinsic_fround_half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_sub_f16_e32 v1.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, |v1.l|, 0.5
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: intrinsic_fround_half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = tail call half @llvm.round.f16(half %arg)
   ret half %res
@@ -907,40 +941,78 @@ define i32 @intrinsic_lround_i32_f16(half %arg) {
 ; GFX10-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: intrinsic_lround_i32_f16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: intrinsic_lround_i32_f16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_trunc_f16_e32 v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_sub_f16_e32 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, |v1.l|, 0.5
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v1.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: intrinsic_lround_i32_f16:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
-; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
-; GFX11-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-FAKE16-LABEL: intrinsic_lround_i32_f16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: intrinsic_lround_i32_f16:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_sub_f16_e32 v1.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, |v1.l|, 0.5
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-GISEL-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: intrinsic_lround_i32_f16:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-FAKE16-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = tail call i32 @llvm.lround.i32.f16(half %arg)
   ret i32 %res

diff  --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index b77bdbf89c7c9..cbd824e171976 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -5,7 +5,9 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
 
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s
+; FIXME-TRUE16. fix gisel
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s
@@ -547,3 +549,5 @@ declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone speculatable }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-GFX11-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index ef325da272005..1e42717f118a6 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -6,7 +6,9 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
 
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-FAKE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
@@ -2723,3 +2725,5 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone speculatable }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-GFX1100-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index e1e356a92f28e..4e07ac7653fc0 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -7,7 +7,9 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s
 
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100,GISEL-GFX1100-FAKE16 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx9-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GEN,GISEL-GFX9GEN %s
@@ -2762,3 +2764,5 @@ declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind "denormal-fp-math-f32"="ieee,ieee" }
 attributes #2 = { nounwind readnone speculatable }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL-GFX1100-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 64afe3cd01255..718a266f49f5d 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define half @v_maximumnum_f16(half %x, half %y) {
 ; GFX8-LABEL: v_maximumnum_f16:
@@ -30,27 +32,49 @@ define half @v_maximumnum_f16(half %x, half %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.maximumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -74,21 +98,37 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_nnan:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_nnan:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_nnan:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_nnan:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_nnan:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_nnan:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan half @llvm.maximumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -115,25 +155,45 @@ define half @v_maximumnum_f16_1.0(half %x) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, 1.0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_1.0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_1.0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, 1.0, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_1.0:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_1.0:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_1.0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, 1.0, v0.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_1.0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, 1.0, v0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.maximumnum.f16(half %x, half 1.0)
   ret half %result
 }
@@ -232,91 +292,183 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_bf16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_bf16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_bf16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
 }
@@ -370,49 +522,97 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_bf16_nnan:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_bf16_nnan:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_bf16_nnan:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
 }
@@ -854,27 +1054,49 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_s_v:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    v_max_f16_e64 v1, s0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_s_v:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, s0, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v1, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_s_v:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, s0, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_s_v:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, s0, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_s_v:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, s0, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_s_v:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, s0, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v1, v0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.maximumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -904,27 +1126,49 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_v_s:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, s0, s0
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_v_s:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, s0, s0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_v_s:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, s0, s0
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_v_s:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, s0, s0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_v_s:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, s0, s0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_v_s:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, s0, s0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
  %result = call half @llvm.maximumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -954,27 +1198,49 @@ define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_s_s:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v0, s1, s1
-; GFX11-NEXT:    v_max_f16_e64 v1, s0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_s_s:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v0, s1, s1
-; GFX12-NEXT:    v_max_num_f16_e64 v1, s0, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v1, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_s_s:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, s1, s1
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, s0, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_s_s:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v0, s1, s1
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, s0, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_s_s:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, s1, s1
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, s0, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_s_s:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v0, s1, s1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, s0, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v1, v0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
  %result = call half @llvm.maximumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -1511,27 +1777,49 @@ define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call half @llvm.fabs.f16(half %y)
   %result = call half @llvm.maximumnum.f16(half %x, half %fabs.y)
   ret half %result
@@ -1562,27 +1850,49 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1|
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, -|v1.l|, -|v1.l|
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, -|v1.l|, -|v1.l|
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1|
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call half @llvm.fabs.f16(half %y)
   %fneg.fabs.y = fneg half %fabs.y
   %result = call half @llvm.maximumnum.f16(half %x, half %fneg.fabs.y)
@@ -1614,27 +1924,49 @@ define half @v_maximumnum_f16_fabs(half %x, half %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_fabs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX11-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_fabs:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
-; GFX12-NEXT:    v_max_num_f16_e64 v0, |v0|, |v0|
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_fabs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_fabs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_fabs:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_fabs:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v0, |v0|, |v0|
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call half @llvm.fabs.f16(half %x)
   %fabs.y = call half @llvm.fabs.f16(half %y)
   %result = call half @llvm.maximumnum.f16(half %fabs.x, half %fabs.y)
@@ -1666,27 +1998,49 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) {
 ; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_maximumnum_f16_fneg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_maximumnum_f16_fneg:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, -v1, -v1
-; GFX12-NEXT:    v_max_num_f16_e64 v0, -v0, -v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_fneg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_fneg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_maximumnum_f16_fneg:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, -v1.l, -v1.l
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, -v0.l, -v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_maximumnum_f16_fneg:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, -v1, -v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v0, -v0, -v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg.x = fneg half %x
   %fneg.y = fneg half %y
   %result = call half @llvm.maximumnum.f16(half %fneg.x, half %fneg.y)

diff  --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 07072f6a36296..d2f4f54cefe78 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -4,7 +4,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
 ; EG-LABEL: v_test_imin_sle_i32:
@@ -713,45 +714,85 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: s_test_imin_sle_v4i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x28
-; GFX11-NEXT:    s_load_b32 s1, s[4:5], 0x4c
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_sext_i32_i16 s2, s0
-; GFX11-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX11-NEXT:    s_sext_i32_i16 s7, s1
-; GFX11-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX11-NEXT:    s_ashr_i32 s6, s0, 24
-; GFX11-NEXT:    s_bfe_i32 s0, s0, 0x80000
-; GFX11-NEXT:    s_ashr_i32 s9, s1, 24
-; GFX11-NEXT:    s_bfe_i32 s1, s1, 0x80000
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 8
-; GFX11-NEXT:    s_bfe_i32 s3, s3, 0x80000
-; GFX11-NEXT:    s_lshr_b32 s7, s7, 8
-; GFX11-NEXT:    s_bfe_i32 s8, s8, 0x80000
-; GFX11-NEXT:    v_min_i16 v0, s6, s9
-; GFX11-NEXT:    v_min_i16 v1, s0, s1
-; GFX11-NEXT:    v_min_i16 v2, s3, s8
-; GFX11-NEXT:    v_min_i16 v3, s2, s7
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: s_test_imin_sle_v4i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[4:5], 0x28
+; GFX11-TRUE16-NEXT:    s_load_b32 s1, s[4:5], 0x4c
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_sext_i32_i16 s2, s0
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11-TRUE16-NEXT:    s_sext_i32_i16 s7, s1
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX11-TRUE16-NEXT:    s_ashr_i32 s6, s0, 24
+; GFX11-TRUE16-NEXT:    s_ashr_i32 s9, s1, 24
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX11-TRUE16-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX11-TRUE16-NEXT:    s_bfe_i32 s0, s0, 0x80000
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s7, s7, 8
+; GFX11-TRUE16-NEXT:    s_bfe_i32 s8, s8, 0x80000
+; GFX11-TRUE16-NEXT:    s_bfe_i32 s1, s1, 0x80000
+; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, s6, s9
+; GFX11-TRUE16-NEXT:    v_min_i16 v1.l, s3, s8
+; GFX11-TRUE16-NEXT:    v_min_i16 v2.l, s2, s7
+; GFX11-TRUE16-NEXT:    v_min_i16 v3.l, s0, s1
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_test_imin_sle_v4i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[4:5], 0x28
+; GFX11-FAKE16-NEXT:    s_load_b32 s1, s[4:5], 0x4c
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_sext_i32_i16 s2, s0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11-FAKE16-NEXT:    s_sext_i32_i16 s7, s1
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX11-FAKE16-NEXT:    s_ashr_i32 s6, s0, 24
+; GFX11-FAKE16-NEXT:    s_bfe_i32 s0, s0, 0x80000
+; GFX11-FAKE16-NEXT:    s_ashr_i32 s9, s1, 24
+; GFX11-FAKE16-NEXT:    s_bfe_i32 s1, s1, 0x80000
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s2, s2, 8
+; GFX11-FAKE16-NEXT:    s_bfe_i32 s3, s3, 0x80000
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s7, s7, 8
+; GFX11-FAKE16-NEXT:    s_bfe_i32 s8, s8, 0x80000
+; GFX11-FAKE16-NEXT:    v_min_i16 v0, s6, s9
+; GFX11-FAKE16-NEXT:    v_min_i16 v1, s0, s1
+; GFX11-FAKE16-NEXT:    v_min_i16 v2, s3, s8
+; GFX11-FAKE16-NEXT:    v_min_i16 v3, s2, s7
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %cmp = icmp sle <4 x i8> %a, %b
   %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %val, ptr addrspace(1) %out
@@ -1249,22 +1290,39 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_imin_slt_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_min_i16 v1, v1, v2
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_test_imin_slt_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[4:5]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_test_imin_slt_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_min_i16 v1, v1, v2
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
   %b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid
@@ -2372,20 +2430,35 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_umin_ult_i8:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX11-NEXT:    global_load_u8 v2, v0, s[4:5]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_min_u16 v1, v1, v2
-; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_test_umin_ult_i8:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_d16_hi_u8 v0, v1, s[4:5]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_test_umin_ult_i8:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    global_load_u8 v2, v0, s[4:5]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_min_u16 v1, v1, v2
+; GFX11-FAKE16-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
   %b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index a32b3b71cd606..2b4d687bb0c29 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define half @v_minimumnum_f16(half %x, half %y) {
 ; GFX8-LABEL: v_minimumnum_f16:
@@ -30,27 +32,49 @@ define half @v_minimumnum_f16(half %x, half %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.minimumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -74,21 +98,37 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_nnan:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_nnan:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_nnan:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_nnan:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_nnan:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_nnan:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan half @llvm.minimumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -115,25 +155,45 @@ define half @v_minimumnum_f16_1.0(half %x) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, 1.0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_1.0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_1.0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, 1.0, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_1.0:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_1.0:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_1.0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, 1.0, v0.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_1.0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, 1.0, v0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.minimumnum.f16(half %x, half 1.0)
   ret half %result
 }
@@ -234,91 +294,183 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX11-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_bf16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
-; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
-; GFX12-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX12-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_bf16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_bf16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_bf16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v1, v1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.h, v0.l, s0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX12-TRUE16-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, s0
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_bf16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_max_num_f32_e32 v2, v2, v2
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v3, v2, 16, 1
+; GFX12-FAKE16-NEXT:    v_or_b32_e32 v4, 0x400000, v2
+; GFX12-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
 }
@@ -374,49 +526,97 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_bf16_nnan:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_bf16_nnan:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_bf16_nnan:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
 }
@@ -858,27 +1058,49 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_v_s:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, s0, s0
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_v_s:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, s0, s0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_v_s:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, s0, s0
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_v_s:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, s0, s0
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_v_s:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, s0, s0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_v_s:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, s0, s0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
  %result = call half @llvm.minimumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -908,27 +1130,49 @@ define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_s_s:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v0, s1, s1
-; GFX11-NEXT:    v_max_f16_e64 v1, s0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_s_s:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v0, s1, s1
-; GFX12-NEXT:    v_max_num_f16_e64 v1, s0, s0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v1, v0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_s_s:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, s1, s1
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, s0, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_s_s:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v0, s1, s1
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, s0, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_s_s:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, s1, s1
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, s0, s0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.h, v0.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_s_s:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v0, s1, s1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, s0, s0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v1, v0
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
  %result = call half @llvm.minimumnum.f16(half %x, half %y)
   ret half %result
 }
@@ -1465,27 +1709,49 @@ define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call half @llvm.fabs.f16(half %y)
   %result = call half @llvm.minimumnum.f16(half %x, half %fabs.y)
   ret half %result
@@ -1516,27 +1782,49 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1|
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, -|v1.l|, -|v1.l|
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -|v1|, -|v1|
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, -|v1.l|, -|v1.l|
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, -|v1|, -|v1|
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fabs.y = call half @llvm.fabs.f16(half %y)
   %fneg.fabs.y = fneg half %fabs.y
   %result = call half @llvm.minimumnum.f16(half %x, half %fneg.fabs.y)
@@ -1568,27 +1856,49 @@ define half @v_minimumnum_f16_fabs(half %x, half %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_fabs:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
-; GFX11-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_fabs:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
-; GFX12-NEXT:    v_max_num_f16_e64 v0, |v0|, |v0|
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_fabs:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_fabs:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, |v1|, |v1|
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v0, |v0|, |v0|
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_fabs:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, |v1.l|, |v1.l|
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, |v0.l|, |v0.l|
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_fabs:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, |v1|, |v1|
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v0, |v0|, |v0|
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fabs.x = call half @llvm.fabs.f16(half %x)
   %fabs.y = call half @llvm.fabs.f16(half %y)
   %result = call half @llvm.minimumnum.f16(half %fabs.x, half %fabs.y)
@@ -1620,27 +1930,49 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) {
 ; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_minimumnum_f16_fneg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT:    v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: v_minimumnum_f16_fneg:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e64 v1, -v1, -v1
-; GFX12-NEXT:    v_max_num_f16_e64 v0, -v0, -v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_fneg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_fneg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v1, -v1, -v1
+; GFX11-FAKE16-NEXT:    v_max_f16_e64 v0, -v0, -v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_minimumnum_f16_fneg:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.h, -v1.l, -v1.l
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e64 v0.l, -v0.l, -v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_minimumnum_f16_fneg:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v1, -v1, -v1
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e64 v0, -v0, -v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fneg.x = fneg half %x
   %fneg.y = fneg half %y
   %result = call half @llvm.minimumnum.f16(half %fneg.x, half %fneg.y)

diff  --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index bf450ab6e80c4..61ac1fe92c278 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -1,12 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
@@ -28,23 +32,59 @@ define i8 @flat_inst_valu_offset_1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:1
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:1
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:1
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:1
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:1
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 1
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -67,23 +107,59 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 2047
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -106,23 +182,59 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -147,26 +259,47 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -186,6 +319,17 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8191
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -210,26 +354,47 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_24bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8388607
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8388607
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8388607
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -249,6 +414,17 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:8388607
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8388607
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -273,26 +449,68 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-2048
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -2048
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -317,26 +535,68 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-4096
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -361,26 +621,68 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8192
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -405,26 +707,68 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_neg_24bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388608
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8388608
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388608
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_neg_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388608
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8388608
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -448,23 +792,59 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_2x_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -489,26 +869,47 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_2x_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -528,6 +929,17 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 8191
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -552,26 +964,47 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_2x_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:16383
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:16383
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:16383
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -591,6 +1024,17 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:16383
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 16383
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -615,29 +1059,53 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4094
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_24bit_max:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8388606
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4094
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4094
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8388606
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8388606
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_2x_24bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -695,26 +1163,68 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-4096
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -739,26 +1249,68 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8192
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -783,26 +1335,68 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-16384
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr %p, i64 -16384
   %load = load i8, ptr %gep, align 4
   ret i8 %load
@@ -827,29 +1421,63 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388607
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8388607
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8388607
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff000001, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_2x_neg_24bit_max:
 ; GFX12-GISEL:       ; %bb.0:
@@ -889,29 +1517,53 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -970,29 +1622,53 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2048
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2048
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2048
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1051,29 +1727,53 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1132,29 +1832,63 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4096
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4096
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4096
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1194,29 +1928,53 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8191
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1275,29 +2033,63 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8192
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8192
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8192
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1338,29 +2130,53 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8386561
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1372,6 +2188,16 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1411,29 +2237,53 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8386560
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1445,6 +2295,16 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1484,29 +2344,53 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8384513
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1518,6 +2402,16 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1557,29 +2451,53 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8384512
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1591,6 +2509,16 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1630,29 +2558,53 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8380417
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1664,6 +2616,16 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1703,29 +2665,53 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8380416
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1737,6 +2723,16 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX11-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX12-GISEL-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1780,25 +2776,65 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_inst_salu_offset_1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:1 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:1 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:1 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: flat_inst_salu_offset_1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:1 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 1
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -1830,25 +2866,65 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 2047
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -1880,25 +2956,65 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -1932,27 +3048,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -1979,6 +3117,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8191
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2012,27 +3160,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xfffff800, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff800, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff800, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2059,6 +3229,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -2048
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2092,27 +3272,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2139,6 +3341,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2172,27 +3384,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2219,6 +3453,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2250,25 +3494,65 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 4095
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2302,27 +3586,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2349,6 +3655,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 8191
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2382,27 +3698,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x3000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x3000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x3000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2429,6 +3767,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 16383
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2462,27 +3810,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2509,6 +3879,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -4096
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2542,27 +3922,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2589,6 +3991,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -8192
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2622,27 +4034,49 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2669,6 +4103,16 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    flat_store_b8 v[0:1], v0
 ; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-GISEL-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr %p, i64 -16384
   %load = load volatile i8, ptr %gep, align 1
   store i8 %load, ptr poison
@@ -2702,29 +4146,53 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2797,29 +4265,53 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2048 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2892,29 +4384,53 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -2988,29 +4504,53 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3084,29 +4624,53 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3180,29 +4744,53 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3277,31 +4865,57 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3376,31 +4990,57 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3475,31 +5115,57 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3574,31 +5240,57 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3673,31 +5365,57 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1fff, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3772,31 +5490,57 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ; GFX10-NEXT:    flat_store_byte v[0:1], v0
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    flat_store_b8 v[0:1], v0
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, s1
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX9-GISEL:       ; %bb.0:
@@ -3844,3 +5588,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10-GISEL: {{.*}}
 ; GFX10-SDAG: {{.*}}
+; GFX11: {{.*}}
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX11-GISEL-TRUE16: {{.*}}
+; GFX11-SDAG: {{.*}}
+; GFX12: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-TRUE16: {{.*}}
+; GFX12-SDAG: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index e426bc73af66d..de5f4f931070e 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -1,12 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
 
 ; Test splitting flat instruction offsets into the low and high bits
 ; when the offset doesn't fit in the offset field.
@@ -26,23 +30,59 @@ define i8 @global_inst_valu_offset_1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:1
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:1
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:1
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:1
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -63,23 +103,59 @@ define i8 @global_inst_valu_offset_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -102,23 +178,23 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
@@ -128,6 +204,42 @@ define i8 @global_inst_valu_offset_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -162,16 +274,16 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: global_inst_valu_offset_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-GISEL-LABEL: global_inst_valu_offset_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
@@ -191,15 +303,47 @@ define i8 @global_inst_valu_offset_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -234,16 +378,16 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: global_inst_valu_offset_24bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8388607
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-GISEL-LABEL: global_inst_valu_offset_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:8388607
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_24bit_max:
 ; GFX9-SDAG:       ; %bb.0:
@@ -263,15 +407,47 @@ define i8 @global_inst_valu_offset_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_24bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x7ff000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8388607
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8388607
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8388607
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -292,23 +468,59 @@ define i8 @global_inst_valu_offset_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-2048
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-2048
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -331,23 +543,59 @@ define i8 @global_inst_valu_offset_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4096
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4096
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -372,26 +620,68 @@ define i8 @global_inst_valu_offset_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8192
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -416,26 +706,68 @@ define i8 @global_inst_valu_offset_neg_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_neg_24bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388608
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388608
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8388608
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_neg_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388608
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8388608
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -458,23 +790,23 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_2x_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: global_inst_valu_offset_2x_11bit_max:
 ; GFX10-SDAG:       ; %bb.0:
@@ -484,6 +816,42 @@ define i8 @global_inst_valu_offset_2x_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -518,16 +886,16 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
 ; GFX9-SDAG:       ; %bb.0:
@@ -547,15 +915,47 @@ define i8 @global_inst_valu_offset_2x_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -590,16 +990,16 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:16383
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:16383
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
 ; GFX9-SDAG:       ; %bb.0:
@@ -619,15 +1019,47 @@ define i8 @global_inst_valu_offset_2x_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:16383
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:16383
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -694,29 +1126,53 @@ define i8 @global_inst_valu_offset_2x_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4094
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_24bit_max:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8388606
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4094
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfff000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4094
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8388606
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8388606
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 16777214
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -739,23 +1195,59 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4096
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4096
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4096
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -780,26 +1272,68 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8192
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8192
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -824,26 +1358,68 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v[0:1], off offset:-16384
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v[0:1], off offset:-16384
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-16384
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-16384
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -910,29 +1486,53 @@ define i8 @global_inst_valu_offset_2x_neg_24bit_max(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff001000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388607
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff001000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff001000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8388607
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_2x_neg_24bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0xff800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8388607
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16777215
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1001,29 +1601,53 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1082,29 +1706,53 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2048
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2048
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2048
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1172,29 +1820,53 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1220,15 +1892,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1244,19 +1916,53 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4096
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4096
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1324,29 +2030,53 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8191
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1372,15 +2102,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1396,19 +2126,53 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8192
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8192
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1478,29 +2242,53 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2049
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386561
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-2049
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-2049
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8386561
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386561
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1561,29 +2349,53 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386560
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-2048
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-2048
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8386560
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8386560
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1653,29 +2465,53 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384513
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8384513
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384513
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1702,15 +2538,15 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1736,19 +2572,53 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384512
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8384512
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8384512
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1818,29 +2688,53 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1)
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380417
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-1
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8380417
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380417
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1867,15 +2761,15 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX11-NEXT:    global_load_u8 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -1901,19 +2795,53 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380416
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x2000, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:-8380416
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, vcc_lo, 0x800000, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 0x80000000, v1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:-8380416
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
   %load = load i8, ptr addrspace(1) %gep, align 4
   ret i8 %load
@@ -1940,25 +2868,65 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 1
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -1986,25 +2954,65 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2032,25 +3040,65 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2078,25 +3126,65 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_13bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_13bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2124,25 +3212,65 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2173,25 +3301,25 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX10-GISEL-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2203,6 +3331,46 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2248,15 +3416,15 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX12-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2269,17 +3437,49 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2307,25 +3507,65 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2353,25 +3593,65 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x1000
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x1000
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2399,25 +3679,65 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3000
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2448,25 +3768,25 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX10-GISEL-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2478,6 +3798,46 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
+;
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -4096
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2523,15 +3883,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2544,17 +3904,49 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2600,15 +3992,15 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
+; GFX12-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-GISEL-NEXT:    s_endpgm
 ;
 ; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
 ; GFX10-SDAG:       ; %bb.0:
@@ -2621,17 +4013,49 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2702,29 +4126,53 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2795,29 +4243,53 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2048 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2888,29 +4360,53 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -2981,29 +4477,53 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3074,29 +4594,53 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:4095 glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3167,29 +4711,53 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
 ; GFX10-SDAG-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX11-SDAG-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-SDAG-NEXT:    s_endpgm
-;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s0
-; GFX12-SDAG-NEXT:    s_wait_alu 0xf1ff
-; GFX12-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-SDAG-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3222,17 +4790,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_add_u32 s0, s0, 0x7ff
-; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x7ff
+; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3246,18 +4814,55 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x7ff
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x7ff
+; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x7ff
+; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x7ff
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x7ff
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3290,17 +4895,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_add_u32 s0, s0, 0x800
-; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x800
+; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3314,18 +4919,55 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x800
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x800
+; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x800
+; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x800
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x800
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3358,17 +5000,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_add_u32 s0, s0, 0xfff
-; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0xfff
+; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3382,18 +5024,55 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0xfff
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0xfff
+; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0xfff
+; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0xfff
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0xfff
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3426,17 +5105,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_add_u32 s0, s0, 0x1000
-; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x1000
+; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3450,18 +5129,55 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x1000
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x1000
+; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x1000
+; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x1000
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x1000
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3494,17 +5210,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_add_u32 s0, s0, 0x1fff
-; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x1fff
+; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3518,18 +5234,55 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x1fff
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x1fff
+; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x1fff
+; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x1fff
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x1fff
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
@@ -3562,17 +5315,17 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX10-NEXT:    global_store_byte v[0:1], v0, off
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_add_u32 s0, s0, 0x2000
-; GFX11-NEXT:    s_addc_u32 s1, s1, 0x80000000
-; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, s0, 0x2000
+; GFX11-GISEL-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-GISEL-NEXT:    s_endpgm
 ;
 ; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
 ; GFX12-GISEL:       ; %bb.0:
@@ -3586,20 +5339,66 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
 ; GFX12-GISEL-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX12-GISEL-NEXT:    s_endpgm
 ;
-; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12-SDAG-NEXT:    s_movk_i32 s2, 0x2000
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT:    s_brev_b32 s3, 1
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-SDAG-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
-; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b8 v[0:1], v0, off
-; GFX12-SDAG-NEXT:    s_endpgm
+; GFX11-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_add_u32 s0, s0, 0x2000
+; GFX11-SDAG-TRUE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_add_u32 s0, s0, 0x2000
+; GFX11-SDAG-FAKE16-NEXT:    s_addc_u32 s1, s1, 0x80000000
+; GFX11-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] glc dlc
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX11-SDAG-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-TRUE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-TRUE16:       ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-TRUE16-NEXT:    s_movk_i32 s2, 0x2000
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-TRUE16-NEXT:    global_load_d16_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-SDAG-FAKE16-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
+; GFX12-SDAG-FAKE16:       ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s2, 0x2000
+; GFX12-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-SDAG-FAKE16-NEXT:    s_brev_b32 s3, 1
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-FAKE16-NEXT:    global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX12-SDAG-FAKE16-NEXT:    s_endpgm
   %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616
   %load = load volatile i8, ptr addrspace(1) %gep, align 1
   store i8 %load, ptr addrspace(1) poison
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX11-GISEL-TRUE16: {{.*}}
+; GFX11-SDAG: {{.*}}
+; GFX12: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-TRUE16: {{.*}}
+; GFX12-SDAG: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index 4e157b40570ed..dc2a2810c6274 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11PLUS,GFX12,GFX12-FAKE16 %s
 
 ; IEEE bit enabled for compute kernel, so shouldn't use.
 define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #4 {
@@ -1097,13 +1099,51 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11PLUS-LABEL: v_omod_div2_f16_denormals:
-; GFX11PLUS:       ; %bb.0:
-; GFX11PLUS-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11PLUS-NEXT:    v_mul_f16_e32 v0, 0.5, v0
-; GFX11PLUS-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_omod_div2_f16_denormals:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.5, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_omod_div2_f16_denormals:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: v_omod_div2_f16_denormals:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.5, v0.l
+; GFX12-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: v_omod_div2_f16_denormals:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GFX12-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX11-TRUE16PLUS-LABEL: v_omod_div2_f16_denormals:
+; GFX11-TRUE16PLUS:       ; %bb.0:
+; GFX11-TRUE16PLUS-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16PLUS-NEXT:    v_mul_f16_e32 v0.l, 0.5, v0.l
+; GFX11-TRUE16PLUS-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16PLUS-NEXT:    s_endpgm
+; GFX11-FAKE16PLUS-LABEL: v_omod_div2_f16_denormals:
+; GFX11-FAKE16PLUS:       ; %bb.0:
+; GFX11-FAKE16PLUS-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16PLUS-NEXT:    v_mul_f16_e32 v0, 0.5, v0
+; GFX11-FAKE16PLUS-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16PLUS-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %div2 = fmul half %add, 0.5
   store half %div2, ptr addrspace(1) poison
@@ -1130,13 +1170,51 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11PLUS-LABEL: v_omod_mul2_f16_denormals:
-; GFX11PLUS:       ; %bb.0:
-; GFX11PLUS-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11PLUS-NEXT:    v_add_f16_e32 v0, v0, v0
-; GFX11PLUS-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_omod_mul2_f16_denormals:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_omod_mul2_f16_denormals:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: v_omod_mul2_f16_denormals:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: v_omod_mul2_f16_denormals:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v0
+; GFX12-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX11-TRUE16PLUS-LABEL: v_omod_mul2_f16_denormals:
+; GFX11-TRUE16PLUS:       ; %bb.0:
+; GFX11-TRUE16PLUS-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16PLUS-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16PLUS-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16PLUS-NEXT:    s_endpgm
+; GFX11-FAKE16PLUS-LABEL: v_omod_mul2_f16_denormals:
+; GFX11-FAKE16PLUS:       ; %bb.0:
+; GFX11-FAKE16PLUS-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16PLUS-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16PLUS-NEXT:    v_add_f16_e32 v0, v0, v0
+; GFX11-FAKE16PLUS-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16PLUS-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %mul2 = fadd half %add, %add
   store half %mul2, ptr addrspace(1) poison
@@ -1161,11 +1239,39 @@ define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
 ; VI-NEXT:    flat_store_short v[0:1], v0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11PLUS-LABEL: v_omod_div2_f16_no_denormals:
-; GFX11PLUS:       ; %bb.0:
-; GFX11PLUS-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
-; GFX11PLUS-NEXT:    global_store_b16 v[0:1], v0, off
-; GFX11PLUS-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_omod_div2_f16_no_denormals:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 div:2
+; GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_omod_div2_f16_no_denormals:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: v_omod_div2_f16_no_denormals:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 div:2
+; GFX12-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: v_omod_div2_f16_no_denormals:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
+; GFX12-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-FAKE16-NEXT:    s_endpgm
+; GFX11-TRUE16PLUS-LABEL: v_omod_div2_f16_no_denormals:
+; GFX11-TRUE16PLUS:       ; %bb.0:
+; GFX11-TRUE16PLUS-NEXT:    v_add_f16_e64 v0.l, v0.l, 1.0 div:2
+; GFX11-TRUE16PLUS-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-TRUE16PLUS-NEXT:    s_endpgm
+; GFX11-FAKE16PLUS-LABEL: v_omod_div2_f16_no_denormals:
+; GFX11-FAKE16PLUS:       ; %bb.0:
+; GFX11-FAKE16PLUS-NEXT:    v_add_f16_e64 v0, v0, 1.0 div:2
+; GFX11-FAKE16PLUS-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-FAKE16PLUS-NEXT:    s_endpgm
   %add = fadd half %a, 1.0
   %div2 = fmul half %add, 0.5
   store half %div2, ptr addrspace(1) poison

diff  --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 35b55a0addd95..e452af7d60c0c 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 declare i64 @_Z13get_global_idj(i32) #0
 
@@ -2622,25 +2623,45 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) {
 ; GFX10-NEXT:  ; %bb.2: ; %end
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: negativeoffsetnullptr:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_mov_b64 s[0:1], src_private_base
-; GFX11-NEXT:    v_add_co_u32 v0, s0, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    flat_load_u8 v0, v[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT:  .LBB8_1: ; %branch
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX11-NEXT:  ; %bb.2: ; %end
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: negativeoffsetnullptr:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-TRUE16-NEXT:    v_add_co_u32 v0, s0, -1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:    flat_load_d16_u8 v0, v[0:1]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:  .LBB8_1: ; %branch
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %end
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: negativeoffsetnullptr:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[0:1], src_private_base
+; GFX11-FAKE16-NEXT:    v_add_co_u32 v0, s0, -1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:    flat_load_u8 v0, v[0:1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:  .LBB8_1: ; %branch
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %end
+; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %null = select i1 false, ptr %buffer, ptr addrspacecast (ptr addrspace(5) null to ptr)
   %gep = getelementptr i8, ptr %null, i64 -1

diff  --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index e5f8de6bd521d..04eea20993608 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
 
 define <2 x float> @v_repeat_divisor_f32_x2(float %x, float %y, float %D) #0 {
 ; GFX6-LABEL: v_repeat_divisor_f32_x2:
@@ -275,16 +276,27 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_repeat_divisor_f16_x2_arcp:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x2_arcp:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v2.l, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_repeat_divisor_f16_x2_arcp:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp half %x, %D
   %div1 = fdiv arcp half %y, %D
   %insert.0 = insertelement <2 x half> poison, half %div0, i32 0
@@ -550,17 +562,29 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_repeat_divisor_f16_x3_arcp:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_rcp_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX11-NEXT:    v_mul_f16_e32 v1, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, v2, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x3_arcp:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v3.l, v3.l
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v2.l, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_repeat_divisor_f16_x3_arcp:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v1, v2, v3
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp half %x, %D
   %div1 = fdiv arcp half %y, %D
   %div2 = fdiv arcp half %z, %D
@@ -812,18 +836,30 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_repeat_divisor_v2f16_x2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f16_e32 v3, v3
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v2, v2, v3
-; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX11-NEXT:    v_pk_mul_f16 v1, v1, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_repeat_divisor_v2f16_x2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v2.h, v2.h
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v2.l, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v1, v1, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_repeat_divisor_v2f16_x2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v2, v2, v3
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v1, v1, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp <2 x half> %x, %D
   %div1 = fdiv arcp <2 x half> %y, %D
   %shuffle = shufflevector <2 x half> %div0, <2 x half> %div1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -910,26 +946,47 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_repeat_divisor_v3f16_x2:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GFX11-NEXT:    v_rcp_f16_e32 v4, v4
-; GFX11-NEXT:    v_rcp_f16_e32 v5, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_rcp_f16_e32 v6, v6
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v5, v5, 0x7e00
-; GFX11-NEXT:    v_pack_b32_f16 v4, v4, v6
-; GFX11-NEXT:    v_pk_mul_f16 v1, v1, v5
-; GFX11-NEXT:    v_pk_mul_f16 v3, v3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_pk_mul_f16 v2, v2, v4
-; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v4
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_repeat_divisor_v3f16_x2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v4.h, v4.h
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v5.l, v5.l
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v4.l, v4.l
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v5, v5.l, 0x7e00
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v4.l, v4.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v1, v1, v5
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v2, v2, v4
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v3, v3, v5
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_repeat_divisor_v3f16_x2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v4, v4
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v6, v6
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v5, v5, 0x7e00
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v4, v4, v6
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v1, v1, v5
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v3, v3, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v2, v2, v4
+; GFX11-FAKE16-NEXT:    v_pk_mul_f16 v0, v0, v4
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v2, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %div0 = fdiv arcp <3 x half> %x, %D
   %div1 = fdiv arcp <3 x half> %y, %D
   %shuffle = shufflevector <3 x half> %div0, <3 x half> %div1, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>

diff  --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 017b37af4cdf2..0a746b0a3f572 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; R600-LABEL: rotl_i32:
@@ -370,20 +371,35 @@ define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX10-NEXT:    global_store_short v[4:5], v0, off offset:8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_rotl_i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v2, v[2:3], off offset:48
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:32
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_sub_nc_u16 v1, 0, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b16 v2, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b16 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT:    global_store_b16 v[4:5], v0, off offset:8
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_rotl_i16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v[2:3], off offset:48
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off offset:32
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.h, 0, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v[4:5], v0, off offset:8
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_rotl_i16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v2, v[2:3], off offset:48
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off offset:32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v1, 0, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v2, v2, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b16 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[4:5], v0, off offset:8
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
   %a = load i16, ptr addrspace(1) %arrayidx

diff  --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index db56589b799dd..d6e361d6e297e 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
 ; R600-LABEL: rotr_i32:
@@ -327,20 +328,35 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
 ; GFX10-NEXT:    global_store_short v[4:5], v0, off offset:8
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_rotr_i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v2, v[2:3], off offset:48
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:32
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_sub_nc_u16 v1, 0, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b16 v2, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b16 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX11-NEXT:    global_store_b16 v[4:5], v0, off offset:8
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: test_rotr_i16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v[2:3], off offset:48
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off offset:32
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_sub_nc_u16 v0.h, 0, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v1.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v[4:5], v0, off offset:8
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: test_rotr_i16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v2, v[2:3], off offset:48
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off offset:32
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_sub_nc_u16 v1, 0, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b16 v2, v2, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b16 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[4:5], v0, off offset:8
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
   %a = load i16, ptr addrspace(1) %arrayidx

diff  --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 0aff5ca25149f..59a1fe041bf90 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -4,13 +4,15 @@
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SDAG_GFX6 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=SDAG_GFX7 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=SDAG_GFX8 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG_GFX9 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=SDAG_GFX10PLUS,SDAG_GFX10 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=SDAG_GFX10PLUS,SDAG_GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=SDAG_GFX10PLUS,SDAG_GFX11,SDAG_GFX11-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=SDAG_GFX10PLUS,SDAG_GFX11,SDAG_GFX11-FAKE16 %s
 
 define float @v_roundeven_f32(float %x) {
 ; GFX6-LABEL: v_roundeven_f32:
@@ -357,11 +359,23 @@ define half @v_roundeven_f16(half %x) {
 ; GFX9-NEXT:    v_rndne_f16_e32 v0, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10PLUS-LABEL: v_roundeven_f16:
-; GFX10PLUS:       ; %bb.0:
-; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_roundeven_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_roundeven_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_roundeven_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_f16:
 ; SDAG_GFX6:       ; %bb.0:
@@ -391,11 +405,23 @@ define half @v_roundeven_f16(half %x) {
 ; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG_GFX10PLUS-LABEL: v_roundeven_f16:
-; SDAG_GFX10PLUS:       ; %bb.0:
-; SDAG_GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10PLUS-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+; SDAG_GFX10-LABEL: v_roundeven_f16:
+; SDAG_GFX10:       ; %bb.0:
+; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
+; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG_GFX11-TRUE16-LABEL: v_roundeven_f16:
+; SDAG_GFX11-TRUE16:       ; %bb.0:
+; SDAG_GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; SDAG_GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG_GFX11-FAKE16-LABEL: v_roundeven_f16:
+; SDAG_GFX11-FAKE16:       ; %bb.0:
+; SDAG_GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; SDAG_GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call half @llvm.roundeven.f16(half %x)
   ret half %roundeven
 }
@@ -447,14 +473,22 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_roundeven_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX11-NEXT:    v_rndne_f16_e32 v1, v1
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_roundeven_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_roundeven_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX6:       ; %bb.0:
@@ -502,14 +536,22 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG_GFX11-LABEL: v_roundeven_v2f16:
-; SDAG_GFX11:       ; %bb.0:
-; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; SDAG_GFX11-NEXT:    s_setpc_b64 s[30:31]
+; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16:
+; SDAG_GFX11-TRUE16:       ; %bb.0:
+; SDAG_GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; SDAG_GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG_GFX11-FAKE16-LABEL: v_roundeven_v2f16:
+; SDAG_GFX11-FAKE16:       ; %bb.0:
+; SDAG_GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v1, v1
+; SDAG_GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x)
   ret <2 x half> %roundeven
 }
@@ -574,15 +616,24 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_roundeven_v2f16_fneg:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX11-NEXT:    v_rndne_f16_e32 v1, v1
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_roundeven_v2f16_fneg:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX6:       ; %bb.0:
@@ -638,14 +689,22 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG_GFX11-LABEL: v_roundeven_v2f16_fneg:
-; SDAG_GFX11:       ; %bb.0:
-; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; SDAG_GFX11-NEXT:    v_rndne_f16_e64 v0, -v0
-; SDAG_GFX11-NEXT:    v_rndne_f16_e64 v1, -v1
-; SDAG_GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; SDAG_GFX11-NEXT:    s_setpc_b64 s[30:31]
+; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
+; SDAG_GFX11-TRUE16:       ; %bb.0:
+; SDAG_GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e64 v0.h, -v0.h
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e64 v0.l, -v0.l
+; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; SDAG_GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG_GFX11-FAKE16-LABEL: v_roundeven_v2f16_fneg:
+; SDAG_GFX11-FAKE16:       ; %bb.0:
+; SDAG_GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e64 v0, -v0
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e64 v1, -v1
+; SDAG_GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %x.fneg = fneg <2 x half> %x
   %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg)
   ret <2 x half> %roundeven
@@ -719,18 +778,29 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_roundeven_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_rndne_f16_e32 v0, v0
-; GFX11-NEXT:    v_rndne_f16_e32 v1, v1
-; GFX11-NEXT:    v_rndne_f16_e32 v2, v2
-; GFX11-NEXT:    v_rndne_f16_e32 v3, v3
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_roundeven_v4f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v1.h, v1.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_roundeven_v4f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX6-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX6:       ; %bb.0:
@@ -799,18 +869,29 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX10-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG_GFX11-LABEL: v_roundeven_v4f16:
-; SDAG_GFX11:       ; %bb.0:
-; SDAG_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; SDAG_GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v2, v2
-; SDAG_GFX11-NEXT:    v_rndne_f16_e32 v3, v3
-; SDAG_GFX11-NEXT:    v_pack_b32_f16 v0, v0, v2
-; SDAG_GFX11-NEXT:    v_pack_b32_f16 v1, v1, v3
-; SDAG_GFX11-NEXT:    s_setpc_b64 s[30:31]
+; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16:
+; SDAG_GFX11-TRUE16:       ; %bb.0:
+; SDAG_GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v1.h, v1.h
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
+; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v1.l, v1.l
+; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
+; SDAG_GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG_GFX11-FAKE16-LABEL: v_roundeven_v4f16:
+; SDAG_GFX11-FAKE16:       ; %bb.0:
+; SDAG_GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG_GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; SDAG_GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v1, v1
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v0, v0
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v2, v2
+; SDAG_GFX11-FAKE16-NEXT:    v_rndne_f16_e32 v3, v3
+; SDAG_GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; SDAG_GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; SDAG_GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
   ret <4 x half> %roundeven
 }
@@ -1289,3 +1370,6 @@ declare half @llvm.fabs.f16(half) #0
 declare float @llvm.fabs.f32(float) #0
 
 attributes #0 = { nounwind readnone speculatable willreturn }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
+; SDAG_GFX11: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
index 9a168c133c552..f7bd5f8d5bfb4 100644
--- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 ; Test if fcmp+select patterns form min/max instructions when allowed
 ; by flags.
@@ -548,17 +549,29 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_f16_safe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_f16_safe:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_f16_safe:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select i1 %cmp, half %a, half %b
   ret half %val
@@ -582,17 +595,29 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select nnan i1 %cmp, half %a, half %b
   ret half %val
@@ -616,17 +641,29 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select nsz i1 %cmp, half %a, half %b
   ret half %val
@@ -649,15 +686,25 @@ define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) {
 ; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select nnan nsz i1 %cmp, half %a, half %b
   ret half %val
@@ -681,17 +728,29 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_f16_safe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_f16_safe:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_f16_safe:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge half %a, %b
   %val = select i1 %cmp, half %a, half %b
   ret half %val
@@ -715,17 +774,29 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge half %a, %b
   %val = select nnan i1 %cmp, half %a, half %b
   ret half %val
@@ -749,17 +820,29 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge half %a, %b
   %val = select nsz i1 %cmp, half %a, half %b
   ret half %val
@@ -782,15 +865,25 @@ define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) {
 ; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge half %a, %b
   %val = select nnan nsz i1 %cmp, half %a, half %b
   ret half %val
@@ -825,25 +918,40 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> %
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_safe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_v2f16_safe:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s0, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_v2f16_safe:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
   %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
   ret <2 x half> %val
@@ -878,25 +986,40 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s0, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_v2f16_nnan_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
   %val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
   ret <2 x half> %val
@@ -931,25 +1054,40 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s0, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_v2f16_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
   %val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
   ret <2 x half> %val
@@ -1020,25 +1158,40 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> %
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_safe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_v2f16_safe:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.h, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s0, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_v2f16_safe:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <2 x half> %a, %b
   %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
   ret <2 x half> %val
@@ -1073,25 +1226,40 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.h, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s0, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_v2f16_nnan_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <2 x half> %a, %b
   %val = select nnan <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
   ret <2 x half> %val
@@ -1126,25 +1294,40 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal
 ; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.h, v1.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s0, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_v2f16_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <2 x half> %a, %b
   %val = select nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
   ret <2 x half> %val
@@ -1232,34 +1415,53 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> %
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_safe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_v4f16_safe:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s0, v0.h, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s1, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s2, v1.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s2
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_v4f16_safe:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
   ret <4 x half> %val
@@ -1311,34 +1513,53 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s0, v0.h, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s1, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s2, v1.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s2
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_v4f16_nnan_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
   ret <4 x half> %val
@@ -1390,34 +1611,53 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s0, v0.h, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s1, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e64 s2, v1.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s2
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmin_legacy_ule_v4f16_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
   ret <4 x half> %val
@@ -1517,34 +1757,53 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> %
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_safe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_v4f16_safe:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1.h, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s0, v0.h, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s1, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s2, v1.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s2
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_v4f16_safe:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <4 x half> %a, %b
   %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
   ret <4 x half> %val
@@ -1596,34 +1855,53 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1.h, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s0, v0.h, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s1, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s2, v1.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s2
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_v4f16_nnan_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <4 x half> %a, %b
   %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
   ret <4 x half> %val
@@ -1675,34 +1953,53 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal
 ; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    s_wait_alu 0xfffd
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-TRUE16-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1.h, v3.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s0, v0.h, v2.h
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s1, v0.l, v2.l
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e64 s2, v1.l, v3.l
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xf1ff
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.h, v0.h, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, s1
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.l, v1.l, s2
+; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_test_fmax_legacy_uge_v4f16_nsz_flag:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-FAKE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <4 x half> %a, %b
   %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
   ret <4 x half> %val

diff  --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index a83ed902f1c9d..ebe6b232bfcbc 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 
 ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
 
@@ -52,31 +53,57 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_xor_b32 s4, s2, s3
-; GFX11-NEXT:    s_cls_i32 s5, s3
-; GFX11-NEXT:    s_ashr_i32 s4, s4, 31
-; GFX11-NEXT:    s_add_i32 s5, s5, -1
-; GFX11-NEXT:    s_add_i32 s4, s4, 32
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_u32 s4, s5, s4
-; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_u32 s2, s2, 1
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT:    s_sub_i32 s2, 32, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: s_sint_to_fp_i64_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s4, s2, s3
+; GFX11-TRUE16-NEXT:    s_cls_i32 s5, s3
+; GFX11-TRUE16-NEXT:    s_ashr_i32 s4, s4, 31
+; GFX11-TRUE16-NEXT:    s_add_i32 s5, s5, -1
+; GFX11-TRUE16-NEXT:    s_add_i32 s4, s4, 32
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_min_u32 s4, s5, s4
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-TRUE16-NEXT:    s_sub_i32 s2, 32, s4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_sint_to_fp_i64_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s4, s2, s3
+; GFX11-FAKE16-NEXT:    s_cls_i32 s5, s3
+; GFX11-FAKE16-NEXT:    s_ashr_i32 s4, s4, 31
+; GFX11-FAKE16-NEXT:    s_add_i32 s5, s5, -1
+; GFX11-FAKE16-NEXT:    s_add_i32 s4, s4, 32
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_min_u32 s4, s5, s4
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-FAKE16-NEXT:    s_sub_i32 s2, 32, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %result = sitofp i64 %in to half
   store half %result, ptr addrspace(1) %out
   ret void
@@ -142,36 +169,67 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
 ; GFX8-NEXT:    flat_store_short v[0:1], v3
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v1
-; GFX11-NEXT:    v_cls_i32_e32 v4, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, -1, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 32, v3
-; GFX11-NEXT:    v_min_u32_e32 v3, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_sint_to_fp_i64_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v3, v0, v1
+; GFX11-TRUE16-NEXT:    v_cls_i32_e32 v4, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v3, v4, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_sint_to_fp_i64_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v3, v0, v1
+; GFX11-FAKE16-NEXT:    v_cls_i32_e32 v4, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v3, 32, v3
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v3, v4, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -747,45 +805,85 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_xor_b32 s7, s2, s3
-; GFX11-NEXT:    s_xor_b32 s9, s0, s1
-; GFX11-NEXT:    s_cls_i32 s6, s3
-; GFX11-NEXT:    s_cls_i32 s8, s1
-; GFX11-NEXT:    s_ashr_i32 s7, s7, 31
-; GFX11-NEXT:    s_ashr_i32 s9, s9, 31
-; GFX11-NEXT:    s_add_i32 s6, s6, -1
-; GFX11-NEXT:    s_add_i32 s8, s8, -1
-; GFX11-NEXT:    s_add_i32 s7, s7, 32
-; GFX11-NEXT:    s_add_i32 s9, s9, 32
-; GFX11-NEXT:    s_min_u32 s6, s6, s7
-; GFX11-NEXT:    s_min_u32 s7, s8, s9
-; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
-; GFX11-NEXT:    s_min_u32 s2, s2, 1
-; GFX11-NEXT:    s_min_u32 s0, s0, 1
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GFX11-NEXT:    s_sub_i32 s0, 32, s6
-; GFX11-NEXT:    s_sub_i32 s1, 32, s7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, s0
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: s_sint_to_fp_v2i64_to_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_xor_b32 s7, s2, s3
+; GFX11-TRUE16-NEXT:    s_xor_b32 s9, s0, s1
+; GFX11-TRUE16-NEXT:    s_cls_i32 s6, s3
+; GFX11-TRUE16-NEXT:    s_cls_i32 s8, s1
+; GFX11-TRUE16-NEXT:    s_ashr_i32 s7, s7, 31
+; GFX11-TRUE16-NEXT:    s_ashr_i32 s9, s9, 31
+; GFX11-TRUE16-NEXT:    s_add_i32 s6, s6, -1
+; GFX11-TRUE16-NEXT:    s_add_i32 s8, s8, -1
+; GFX11-TRUE16-NEXT:    s_add_i32 s7, s7, 32
+; GFX11-TRUE16-NEXT:    s_add_i32 s9, s9, 32
+; GFX11-TRUE16-NEXT:    s_min_u32 s6, s6, s7
+; GFX11-TRUE16-NEXT:    s_min_u32 s7, s8, s9
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
+; GFX11-TRUE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-TRUE16-NEXT:    s_min_u32 s0, s0, 1
+; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX11-TRUE16-NEXT:    s_sub_i32 s0, 32, s6
+; GFX11-TRUE16-NEXT:    s_sub_i32 s1, 32, s7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, s0
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_sint_to_fp_v2i64_to_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_xor_b32 s7, s2, s3
+; GFX11-FAKE16-NEXT:    s_xor_b32 s9, s0, s1
+; GFX11-FAKE16-NEXT:    s_cls_i32 s6, s3
+; GFX11-FAKE16-NEXT:    s_cls_i32 s8, s1
+; GFX11-FAKE16-NEXT:    s_ashr_i32 s7, s7, 31
+; GFX11-FAKE16-NEXT:    s_ashr_i32 s9, s9, 31
+; GFX11-FAKE16-NEXT:    s_add_i32 s6, s6, -1
+; GFX11-FAKE16-NEXT:    s_add_i32 s8, s8, -1
+; GFX11-FAKE16-NEXT:    s_add_i32 s7, s7, 32
+; GFX11-FAKE16-NEXT:    s_add_i32 s9, s9, 32
+; GFX11-FAKE16-NEXT:    s_min_u32 s6, s6, s7
+; GFX11-FAKE16-NEXT:    s_min_u32 s7, s8, s9
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
+; GFX11-FAKE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-FAKE16-NEXT:    s_min_u32 s0, s0, 1
+; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX11-FAKE16-NEXT:    s_sub_i32 s0, 32, s6
+; GFX11-FAKE16-NEXT:    s_sub_i32 s1, 32, s7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, s0
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v1, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX11-FAKE16-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %result = sitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, ptr addrspace(1) %out
   ret void
@@ -942,82 +1040,161 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
-; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_xor_b32_e32 v9, v2, v3
-; GFX11-NEXT:    v_xor_b32_e32 v11, v0, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v13, v6, v7
-; GFX11-NEXT:    v_xor_b32_e32 v15, v4, v5
-; GFX11-NEXT:    v_cls_i32_e32 v10, v3
-; GFX11-NEXT:    v_cls_i32_e32 v12, v1
-; GFX11-NEXT:    v_cls_i32_e32 v14, v7
-; GFX11-NEXT:    v_cls_i32_e32 v16, v5
-; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
-; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
-; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
-; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
-; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
-; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
-; GFX11-NEXT:    v_add_nc_u32_e32 v14, -1, v14
-; GFX11-NEXT:    v_add_nc_u32_e32 v16, -1, v16
-; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
-; GFX11-NEXT:    v_add_nc_u32_e32 v13, 32, v13
-; GFX11-NEXT:    v_add_nc_u32_e32 v15, 32, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v9, v10, v9
-; GFX11-NEXT:    v_min_u32_e32 v10, v12, v11
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v11, v14, v13
-; GFX11-NEXT:    v_min_u32_e32 v12, v16, v15
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
-; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
-; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v9
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v10
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
-; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v4
-; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_sint_to_fp_v4i64_to_v4f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v9, v2, v3
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v11, v0, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v13, v6, v7
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v15, v4, v5
+; GFX11-TRUE16-NEXT:    v_cls_i32_e32 v10, v3
+; GFX11-TRUE16-NEXT:    v_cls_i32_e32 v12, v1
+; GFX11-TRUE16-NEXT:    v_cls_i32_e32 v14, v7
+; GFX11-TRUE16-NEXT:    v_cls_i32_e32 v16, v5
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, -1, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, -1, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, -1, v16
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v13, 32, v13
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v15, 32, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v9, v10, v9
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v10, v12, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v11, v14, v13
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v12, v16, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v2, v2, v9
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v5, v0, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, v11
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v3, v3, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_sint_to_fp_v4i64_to_v4f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v9, v2, v3
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v11, v0, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v13, v6, v7
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v15, v4, v5
+; GFX11-FAKE16-NEXT:    v_cls_i32_e32 v10, v3
+; GFX11-FAKE16-NEXT:    v_cls_i32_e32 v12, v1
+; GFX11-FAKE16-NEXT:    v_cls_i32_e32 v14, v7
+; GFX11-FAKE16-NEXT:    v_cls_i32_e32 v16, v5
+; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
+; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
+; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
+; GFX11-FAKE16-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, -1, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, -1, v12
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, -1, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v16, -1, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, 32, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, 32, v11
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 32, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, 32, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v9, v10, v9
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v10, v12, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v11, v14, v13
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v12, v16, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v7, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_i32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v2, v2, v9
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, v10
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v1, v1, v11
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v0, v2
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v3, v4
+; GFX11-FAKE16-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
   %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index ef92cf3214e7f..92918f19a98a5 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,FIJI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
 ; CIVI-LABEL: local_store_i56:
@@ -334,13 +335,21 @@ define void @local_store_i13(ptr addrspace(3) %ptr, i13 %arg) #0 {
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: local_store_i13:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
-; GFX11-NEXT:    ds_store_b16 v0, v1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: local_store_i13:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0x1fff, v1.l
+; GFX11-TRUE16-NEXT:    ds_store_b16 v0, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: local_store_i13:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
+; GFX11-FAKE16-NEXT:    ds_store_b16 v0, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   store i13 %arg, ptr addrspace(3) %ptr, align 8
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
index df6c1a0aa8c46..40aac82888de2 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
@@ -3,7 +3,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX11-FAKE16 %s
 
 define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
 ; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
@@ -20,11 +21,23 @@ define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
 ; GFX89-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
   ret float %result
 }
@@ -57,13 +70,21 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
   ret <2 x float>   %result
 }
@@ -103,15 +124,24 @@ define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v1.l
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v3
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
   ret <3 x float>   %result
 }
@@ -189,12 +219,26 @@ define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 {
 ; GFX89-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1011-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call double @llvm.experimental.constrained.fpext.f64.f16(half %arg, metadata !"fpexcept.strict")
   ret double %result
 }
@@ -231,15 +275,24 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
   ret <2 x double>   %result
 }
@@ -284,17 +337,28 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal
 ; GFX10-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
-; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v2, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
+; GFX11-TRUE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
+; GFX11-FAKE16-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
   ret <3 x double>   %result
 }
@@ -316,12 +380,26 @@ define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0
 ; GFX89-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX1011-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
   %neg.result = fneg float %result
   ret float %neg.result
@@ -343,11 +421,23 @@ define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0
 ; GFX89-NEXT:    v_cvt_f32_f16_e64 v0, -v0
 ; GFX89-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1011-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
-; GFX1011:       ; %bb.0:
-; GFX1011-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1011-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX1011-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %neg.arg = fneg half %arg
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %neg.arg, metadata !"fpexcept.strict")
   ret float %result
@@ -413,13 +503,21 @@ define float @v_constrained_fpext_f16_to_f32_noabi(ptr addrspace(1) %ptr) #0 {
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_constrained_fpext_f16_to_f32_noabi:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_constrained_fpext_f16_to_f32_noabi:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_f16_to_f32_noabi:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = load half, ptr addrspace(1) %ptr
   %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict")
   ret float %result
@@ -467,15 +565,24 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_noabi(ptr addrspace(1) %p
 ; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x half>, ptr addrspace(1) %ptr
   %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict")
   ret <2 x float> %result
@@ -495,3 +602,5 @@ declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>,
 
 attributes #0 = { strictfp }
 attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 8118441df0cfc..a3aeea8a145cd 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
 
@@ -432,21 +433,37 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: test_sub_i16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_sub_nc_u16 v0, v1, v0
-; GFX12-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: test_sub_i16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v1, s[2:3] offset:2 scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    v_sub_nc_u16 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_sub_i16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_sub_nc_u16 v0, v1, v0
+; GFX12-FAKE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
   %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1

diff  --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 2d5e1bb483421..5b1a5206c3403 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 
 ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
 
@@ -44,27 +45,49 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: s_uint_to_fp_i64_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clz_i32_u32 s4, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_u32 s4, s4, 32
-; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_u32 s2, s2, 1
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT:    s_sub_i32 s2, 32, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: s_uint_to_fp_i64_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clz_i32_u32 s4, s3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_min_u32 s4, s4, 32
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX11-TRUE16-NEXT:    s_sub_i32 s2, 32, s4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_uint_to_fp_i64_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clz_i32_u32 s4, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_min_u32 s4, s4, 32
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX11-FAKE16-NEXT:    s_sub_i32 s2, 32, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %result = uitofp i64 %in to half
   store half %result, ptr addrspace(1) %out
   ret void
@@ -122,30 +145,55 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
 ; GFX8-NEXT:    flat_store_short v[0:1], v3
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_uint_to_fp_i64_to_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v1
-; GFX11-NEXT:    v_min_u32_e32 v3, 32, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
-; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_uint_to_fp_i64_to_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_clz_i32_u32_e32 v3, v1
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_uint_to_fp_i64_to_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_clz_i32_u32_e32 v3, v1
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v3, 32, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -606,37 +654,69 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clz_i32_u32 s6, s3
-; GFX11-NEXT:    s_clz_i32_u32 s7, s1
-; GFX11-NEXT:    s_min_u32 s6, s6, 32
-; GFX11-NEXT:    s_min_u32 s7, s7, 32
-; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
-; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
-; GFX11-NEXT:    s_min_u32 s2, s2, 1
-; GFX11-NEXT:    s_min_u32 s0, s0, 1
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GFX11-NEXT:    s_sub_i32 s0, 32, s6
-; GFX11-NEXT:    s_sub_i32 s1, 32, s7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, s0
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: s_uint_to_fp_v2i64_to_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clz_i32_u32 s6, s3
+; GFX11-TRUE16-NEXT:    s_clz_i32_u32 s7, s1
+; GFX11-TRUE16-NEXT:    s_min_u32 s6, s6, 32
+; GFX11-TRUE16-NEXT:    s_min_u32 s7, s7, 32
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
+; GFX11-TRUE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-TRUE16-NEXT:    s_min_u32 s0, s0, 1
+; GFX11-TRUE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GFX11-TRUE16-NEXT:    s_sub_i32 s0, 32, s6
+; GFX11-TRUE16-NEXT:    s_sub_i32 s1, 32, s7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, s0
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_uint_to_fp_v2i64_to_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clz_i32_u32 s6, s3
+; GFX11-FAKE16-NEXT:    s_clz_i32_u32 s7, s1
+; GFX11-FAKE16-NEXT:    s_min_u32 s6, s6, 32
+; GFX11-FAKE16-NEXT:    s_min_u32 s7, s7, 32
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
+; GFX11-FAKE16-NEXT:    s_min_u32 s2, s2, 1
+; GFX11-FAKE16-NEXT:    s_min_u32 s0, s0, 1
+; GFX11-FAKE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GFX11-FAKE16-NEXT:    s_sub_i32 s0, 32, s6
+; GFX11-FAKE16-NEXT:    s_sub_i32 s1, 32, s7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, s0
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v1, v1, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX11-FAKE16-NEXT:    global_store_b32 v2, v0, s[4:5]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %result = uitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, ptr addrspace(1) %out
   ret void
@@ -761,65 +841,127 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
-; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v3
-; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
-; GFX11-NEXT:    v_clz_i32_u32_e32 v12, v5
-; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
-; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
-; GFX11-NEXT:    v_min_u32_e32 v12, 32, v12
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
-; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
-; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
-; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
-; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
-; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
-; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
-; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
-; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
-; GFX11-NEXT:    v_ldexp_f32 v2, v2, v9
-; GFX11-NEXT:    v_ldexp_f32 v0, v0, v10
-; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
-; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v1
-; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v1, v0, v2
-; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v4
-; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_uint_to_fp_v4i64_to_v4f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_clz_i32_u32_e32 v9, v3
+; GFX11-TRUE16-NEXT:    v_clz_i32_u32_e32 v10, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_clz_i32_u32_e32 v11, v7
+; GFX11-TRUE16-NEXT:    v_clz_i32_u32_e32 v12, v5
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v9, 32, v9
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v10, 32, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v11, 32, v11
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v12, 32, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX11-TRUE16-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX11-TRUE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, v7, v6
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v2, v2, v9
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v5, v0, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, v11
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v3, v3, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v3
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 3, v8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; GFX11-TRUE16-NEXT:    global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_uint_to_fp_v4i64_to_v4f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
+; GFX11-FAKE16-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_clz_i32_u32_e32 v9, v3
+; GFX11-FAKE16-NEXT:    v_clz_i32_u32_e32 v10, v1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_clz_i32_u32_e32 v11, v7
+; GFX11-FAKE16-NEXT:    v_clz_i32_u32_e32 v12, v5
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v9, 32, v9
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v10, 32, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v11, 32, v11
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v12, 32, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
+; GFX11-FAKE16-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v6, 1, v6
+; GFX11-FAKE16-NEXT:    v_min_u32_e32 v4, 1, v4
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v1, v7, v6
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, v5, v4
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v2, v2, v9
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v0, v0, v10
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v1, v1, v11
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v3, v3, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v1
+; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v0, v2
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v3, v4
+; GFX11-FAKE16-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
   %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid

diff  --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index a3f632267ccd6..a41063f467d01 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare half @llvm.fabs.f16(half)
@@ -2226,49 +2228,93 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
 ; GFX10-NEXT:    global_store_short v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_cndmask_abs_neg_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v0, s[0:1]
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: v_cndmask_abs_neg_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
-; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v0, s[0:1]
-; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX12-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
-; GFX12-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX12-NEXT:    global_store_b16 v2, v0, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: v_cndmask_abs_neg_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, s[2:3]
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_cndmask_abs_neg_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX11-FAKE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: v_cndmask_abs_neg_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-TRUE16-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b16 v0.h, 0x7fff, v0.l
+; GFX12-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, s[2:3]
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: v_cndmask_abs_neg_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v0, s[0:1]
+; GFX12-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX12-FAKE16-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX12-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX12-FAKE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
   %f = load half, ptr addrspace(1) %f.gep

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
index 1d921b0d6e254..dfa50ce55a521 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_add_v2i8(<2 x i8> %v) {
 ; GFX7-LABEL: test_vector_reduce_add_v2i8:
@@ -37,21 +42,53 @@ define i8 @test_vector_reduce_add_v2i8(<2 x i8> %v) {
 ; GFX10-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_add_v2i8:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: test_vector_reduce_add_v2i8:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_vector_reduce_add_v2i8:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_vector_reduce_add_v2i8:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v)
   ret i8 %res
@@ -114,13 +151,21 @@ define i8 @test_vector_reduce_add_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -130,17 +175,29 @@ define i8 @test_vector_reduce_add_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -240,21 +297,39 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -266,25 +341,47 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -418,26 +515,49 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.h, v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.h, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v2.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -454,30 +574,57 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.h, v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.h, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v2.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -680,35 +827,66 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v3, v3, v11
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v4, v4, v12
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v6, v6, v14
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v2, v2, v10
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.h, v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v3.h, v7.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v3.l, v3.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v2.h, v6.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v2.l, v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.h, v4.l, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.h, v3.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.h, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v2.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v3, v3, v11
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v6, v6, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v2, v2, v10
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -735,39 +913,74 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v3, v3, v11
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v4, v4, v12
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v6, v6, v14
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v2, v2, v10
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.h, v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v3.h, v7.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v3.l, v3.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v2.h, v6.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v2.l, v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.h, v4.l, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.h, v3.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.h, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v1.l, v2.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v3, v3, v11
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v6, v6, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v2, v2, v10
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -858,13 +1071,19 @@ define i16 @test_vector_reduce_add_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -874,17 +1093,27 @@ define i16 @test_vector_reduce_add_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -960,14 +1189,22 @@ define i16 @test_vector_reduce_add_v3i16(<3 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v3i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v3i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -978,18 +1215,30 @@ define i16 @test_vector_reduce_add_v3i16(<3 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v3i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v3i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1082,14 +1331,22 @@ define i16 @test_vector_reduce_add_v4i16(<4 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v4i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1100,18 +1357,30 @@ define i16 @test_vector_reduce_add_v4i16(<4 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v4i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v4i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1230,17 +1499,27 @@ define i16 @test_vector_reduce_add_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1254,21 +1533,35 @@ define i16 @test_vector_reduce_add_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1438,22 +1731,37 @@ define i16 @test_vector_reduce_add_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_add_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v1, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_add_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_add_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1472,26 +1780,45 @@ define i16 @test_vector_reduce_add_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_add_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_add_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_nc_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_add_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_add_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_add_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_add_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -3132,3 +3459,6 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
index 4eba4ff954b1f..801324eec454e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_and_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_and_v2i8:
@@ -69,18 +74,31 @@ define i8 @test_vector_reduce_and_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -88,22 +106,39 @@ define i8 @test_vector_reduce_and_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -176,13 +211,21 @@ define i8 @test_vector_reduce_and_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -192,17 +235,29 @@ define i8 @test_vector_reduce_and_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -293,15 +348,25 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -313,19 +378,33 @@ define i8 @test_vector_reduce_and_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -450,20 +529,35 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v2.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -480,24 +574,43 @@ define i8 @test_vector_reduce_and_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v2.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -691,29 +804,53 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v6, v6, v14
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, v2, v10
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, v3, v11
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v4, v4, v12
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v5.l, v7.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v6.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v2.h, v3.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, v4.l, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, v2.h, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, v6, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v11
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -740,33 +877,61 @@ define i8 @test_vector_reduce_and_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v6, v6, v14
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, v2, v10
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, v3, v11
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v4, v4, v12
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v5.l, v7.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v6.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v2.h, v3.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, v4.l, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, v2.h, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, v6, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v10
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v11
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -855,13 +1020,22 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -871,17 +1045,30 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -936,27 +1123,73 @@ define i16 @test_vector_reduce_and_v3i16(<3 x i16> %v) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_and_v3i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_vector_reduce_and_v3i16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: test_vector_reduce_and_v3i16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_vector_reduce_and_v3i16:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.and.v3i16(<3 x i16> %v)
   ret i16 %res
@@ -1018,27 +1251,73 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) {
 ; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_and_v4i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_vector_reduce_and_v4i16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: test_vector_reduce_and_v4i16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_vector_reduce_and_v4i16:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v)
   ret i16 %res
@@ -1134,17 +1413,30 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1158,21 +1450,38 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1332,22 +1641,40 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_and_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_and_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1366,26 +1693,48 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_and_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_and_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_and_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -3116,3 +3465,6 @@ declare i64 @llvm.vector.reduce.and.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
index 64c396f14bc70..bce7c1e5e8ab7 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
@@ -7,10 +7,14 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v2half:
@@ -60,27 +64,91 @@ define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) {
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fadd_v2half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fadd_v2half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v2half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fadd.v2half(half %sp, <2 x half> %v)
   ret half %res
@@ -144,53 +212,97 @@ define half @test_vector_reduce_fadd_v3half(half %sp, <3 x half> %v) {
 ; GFX10-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fadd_v3half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fadd_v3half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v3
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fadd_v3half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fadd_v3half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_add_f16_e32 v0, v0, v3
-; GFX12-GISEL-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v3half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fadd.v3half(half %sp, <3 x half> %v)
   ret half %res
@@ -264,35 +376,121 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) {
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fadd_v4half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fadd_v4half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v4half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fadd.v4half(half %sp, <4 x half> %v)
   ret half %res
@@ -406,51 +604,181 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) {
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fadd_v8half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fadd_v8half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v8half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fadd.v8half(half %sp, <8 x half> %v)
   ret half %res
@@ -644,83 +972,301 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) {
 ; GFX10-NEXT:    v_add_f16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fadd_v16half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fadd_v16half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v6
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v7
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_add_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v8.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v6
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v7.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v8.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v5
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v7
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v8
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v6.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v7.h
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v8.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v5.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v6.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v7.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v8.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fadd_v16half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v4
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v5
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v6
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v7
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v8
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fadd.v16half(half %sp, <16 x half> %v)
   ret half %res

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
index 94ebd1184b594..d500a3e50f9f7 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
@@ -7,10 +7,14 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define half @test_vector_reduce_fmax_v2half(<2 x half> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_fmax_v2half:
@@ -80,29 +84,97 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmax_v2half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmax_v2half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v2half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmax.v2half(<2 x half> %v)
   ret half %res
@@ -197,65 +269,119 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v3half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmax_v3half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmax_v3half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmax_v3half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v3half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmax.v3half(<3 x half> %v)
   ret half %res
@@ -362,71 +488,127 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v4half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmax_v4half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmax_v4half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmax_v4half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v1.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v1.h, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v4half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmax.v4half(<4 x half> %v)
   ret half %res
@@ -599,99 +781,181 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v8half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmax_v8half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v7
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v2, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmax_v8half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmax_v8half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v7
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v1.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.h, v2.h, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.h, v3.h, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v2.l, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v3.l, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v1.h, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.h, v2.h, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.h, v3.h, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v3.l, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v8half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v5, v5, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v6, v6, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v7, v7, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v6
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmax.v8half(<8 x half> %v)
   ret half %res
@@ -996,161 +1260,285 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmax_v16half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v8, v8, v8
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_f16 v0, v0, v1, v9
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-SDAG-NEXT:    v_max3_f16 v0, v0, v2, v8
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_f16 v0, v0, v3, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX11-SDAG-NEXT:    v_max3_f16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_f16 v0, v0, v5, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX11-SDAG-NEXT:    v_max3_f16 v0, v0, v6, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max3_f16 v0, v0, v7, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmax_v16half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v8
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v9
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v10
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v8, v11, v11
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v9, v12, v12
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v10, v13, v13
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v11, v14, v14
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v12, v15, v15
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v8
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v4, v4, v9
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v5, v5, v10
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v6, v6, v11
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v7, v7, v12
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v2, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v6, v7
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v2, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmax_v16half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v8, v8, v8
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_num_f16 v0, v0, v1, v9
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX12-SDAG-NEXT:    v_max3_num_f16 v0, v0, v2, v8
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_num_f16 v0, v0, v3, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX12-SDAG-NEXT:    v_max3_num_f16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_num_f16 v0, v0, v5, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX12-SDAG-NEXT:    v_max3_num_f16 v0, v0, v6, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max3_num_f16 v0, v0, v7, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmax_v16half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v9
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v10
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v8, v11, v11
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v9, v12, v12
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v10, v13, v13
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v11, v14, v14
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v12, v15, v15
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v8
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v4, v4, v9
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v5, v5, v10
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v6, v6, v11
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v7, v7, v12
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v6, v7
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v2.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v3.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v4.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v5.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v6.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v7.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v8, v8, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_f16 v0, v0, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_f16 v0, v0, v2, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_f16 v0, v0, v3, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_f16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_f16 v0, v0, v5, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_f16 v0, v0, v6, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_f16 v0, v0, v7, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v1.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.h, v2.h, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v3.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v2.l, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.l, v3.h, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.l, v4.h, v4.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.h, v5.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v4.l, v5.h, v5.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v4.h, v6.l, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v5.l, v6.h, v6.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v5.h, v7.l, v7.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v6.l, v7.h, v7.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v1.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.l, v2.h, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.h, v3.h, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.l, v4.h, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.h, v5.h, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v2.l, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v3.l, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v8, v8, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v9, v9, v9
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v10, v10, v10
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v9
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v10
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v8, v11, v11
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v9, v12, v12
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v10, v13, v13
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v11, v14, v14
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v12, v15, v15
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v9
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v5, v5, v10
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v6, v6, v11
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v7, v7, v12
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v4, v5
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v6, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v2.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v3.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v4.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v5.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v6.l, v6.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v7.l, v7.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v8, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v2, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v3, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v5, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v6, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_num_f16 v0, v0, v7, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v1.h, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.h, v2.h, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v3.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v3.h, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v4.h, v4.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.h, v5.l, v5.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, v5.h, v5.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v4.h, v6.l, v6.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v5.l, v6.h, v6.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v5.h, v7.l, v7.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v6.l, v7.h, v7.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v1.h, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.h, v3.h, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v4.h, v5.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.h, v5.h, v6.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v3.l, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmax_v16half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v8, v8, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v9, v9, v9
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v10, v10, v10
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v9
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v10
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v8, v11, v11
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v9, v12, v12
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v5, v5, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v10, v13, v13
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v6, v6, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v11, v14, v14
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v7, v7, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v12, v15, v15
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v9
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v5, v5, v10
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v6, v6, v11
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v7, v7, v12
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v4, v5
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v6, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmax.v16half(<16 x half> %v)
   ret half %res
@@ -3192,6 +3580,8 @@ declare double @llvm.vector.reduce.fmax.v8double(<8 x double>)
 declare double @llvm.vector.reduce.fmax.v16double(<16 x double>)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
 ; GFX7: {{.*}}
 ; GFX8: {{.*}}
 ; GFX9: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
index c486f29ce60f3..389df695ba324 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
@@ -3,8 +3,10 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
 
 define half @test_vector_reduce_fmaximum_v2half(<2 x half> %v) {
 ; GFX7-LABEL: test_vector_reduce_fmaximum_v2half:
@@ -48,27 +50,46 @@ define half @test_vector_reduce_fmaximum_v2half(<2 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmaximum_v2half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmaximum_v2half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v2half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v2half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v2half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v2half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmaximum.v2half(<2 x half> %v)
   ret half %res
@@ -131,36 +152,64 @@ define half @test_vector_reduce_fmaximum_v3half(<3 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmaximum_v3half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v3, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmaximum_v3half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s0, 0xfc00
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v3half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v3half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v3, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v3half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v2, 0xfc00
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v3half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_mov_b32 s0, 0xfc00
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmaximum.v3half(<3 x half> %v)
   ret half %res
@@ -239,37 +288,66 @@ define half @test_vector_reduce_fmaximum_v4half(<4 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmaximum_v4half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v3, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmaximum_v4half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v4half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v4half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v3, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v4half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v4half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmaximum.v4half(<4 x half> %v)
   ret half %res
@@ -410,58 +488,105 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmaximum_v8half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v5, v0, v4
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v4, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v5
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v4
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmaximum_v8half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v8half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v8half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v5, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v4, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v8half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v8half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmaximum.v8half(<8 x half> %v)
   ret half %res
@@ -726,99 +851,183 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmaximum_v16half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v9, v0, v8
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v8, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v9
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v8
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v4
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v5
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v6
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v7
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmaximum_v16half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_pk_maximum_f16 v3, v3, v7
-; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v5
-; GFX12-NEXT:    v_pk_maximum_f16 v2, v2, v6
-; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_pk_maximum_f16 v1, v1, v3
-; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_pk_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_maximum_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v16half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v16half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v9, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v8, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmaximum_v16half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmaximum_v16half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_maximum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmaximum.v16half(<16 x half> %v)
   ret half %res

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
index 502d5840d24bb..56e7e045e40eb 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
@@ -7,10 +7,14 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define half @test_vector_reduce_fmin_v2half(<2 x half> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_fmin_v2half:
@@ -80,29 +84,97 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_min_f16_e32 v0, v1, v0
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmin_v2half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmin_v2half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v2half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmin.v2half(<2 x half> %v)
   ret half %res
@@ -197,65 +269,119 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v3half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
-; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmin_v3half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmin_v3half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
-; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmin_v3half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, 0x7e00
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v3half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmin.v3half(<3 x half> %v)
   ret half %res
@@ -362,71 +488,127 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v4half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmin_v4half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmin_v4half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmin_v4half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v1, v1, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v1.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v1.h, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v4half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v1, v1, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmin.v4half(<4 x half> %v)
   ret half %res
@@ -599,99 +781,181 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v8half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmin_v8half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v1, v1, v5
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v2, v2, v6
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v3, v3, v7
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v1, v2, v3
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmin_v8half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmin_v8half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v1, v1, v5
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v2, v2, v6
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v3, v3, v7
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v1.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.h, v2.h, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.l, v3.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.h, v3.h, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v1.l, v2.l, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v1.h, v3.l, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v1, v1, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v2, v2, v6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v3, v3, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_num_f16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_num_f16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_num_f16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_num_f16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_num_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v1.h, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.h, v2.h, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v3.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.h, v3.h, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v1.h, v3.l, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v8half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v5, v5, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v6, v6, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v7, v7, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v1, v1, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v2, v2, v6
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v3, v3, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmin.v8half(<8 x half> %v)
   ret half %res
@@ -996,161 +1260,285 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) {
 ; GFX10-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmin_v16half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f16_e32 v8, v8, v8
-; GFX11-SDAG-NEXT:    v_min_f16_e32 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_f16 v0, v0, v1, v9
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-SDAG-NEXT:    v_min3_f16 v0, v0, v2, v8
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_f16 v0, v0, v3, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX11-SDAG-NEXT:    v_min3_f16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_f16 v0, v0, v5, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX11-SDAG-NEXT:    v_min3_f16 v0, v0, v6, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min3_f16 v0, v0, v7, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmin_v16half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v8
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v1, v1, v9
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v2, v2, v10
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v8, v11, v11
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v9, v12, v12
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v10, v13, v13
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v11, v14, v14
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_max_f16_e32 v12, v15, v15
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v3, v3, v8
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v4, v4, v9
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v5, v5, v10
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v6, v6, v11
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v7, v7, v12
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v1, v2, v3
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v2, v4, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v3, v6, v7
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v1, v2, v3
-; GFX11-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmin_v16half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_num_f16_e32 v8, v8, v8
-; GFX12-SDAG-NEXT:    v_min_num_f16_e32 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_num_f16 v0, v0, v1, v9
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX12-SDAG-NEXT:    v_min3_num_f16 v0, v0, v2, v8
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_num_f16 v0, v0, v3, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX12-SDAG-NEXT:    v_min3_num_f16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_num_f16 v0, v0, v5, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX12-SDAG-NEXT:    v_min3_num_f16 v0, v0, v6, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min3_num_f16 v0, v0, v7, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmin_v16half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v1, v1, v9
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v2, v2, v10
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v8, v11, v11
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v9, v12, v12
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v10, v13, v13
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v11, v14, v14
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_max_num_f16_e32 v12, v15, v15
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v3, v3, v8
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v4, v4, v9
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v5, v5, v10
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v6, v6, v11
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v7, v7, v12
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v2, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v3, v6, v7
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_min_num_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v2.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v3.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v4.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v5.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v6.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v7.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_f16_e32 v8, v8, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_f16 v0, v0, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_f16 v0, v0, v2, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_f16 v0, v0, v3, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_f16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_f16 v0, v0, v5, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_f16 v0, v0, v6, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_f16 v0, v0, v7, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v1.h, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.l, v2.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.h, v2.h, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v1.h, v3.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v1.l, v2.l, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.l, v3.h, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v2.h, v4.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.l, v4.h, v4.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v3.h, v5.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v4.l, v5.h, v5.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v4.h, v6.l, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v5.l, v6.h, v6.h
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v5.h, v7.l, v7.l
+; GFX11-GISEL-TRUE16-NEXT:    v_max_f16_e32 v6.l, v7.h, v7.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v1.h, v1.h, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v2.l, v2.h, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v2.h, v3.h, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v3.l, v4.h, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v3.h, v5.h, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v1.l, v2.l, v2.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v1.h, v3.l, v3.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.h, v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v8, v8, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v9, v9, v9
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v2, v2, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v10, v10, v10
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v1, v1, v9
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v2, v2, v10
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v3, v3, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v8, v11, v11
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v4, v4, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v9, v12, v12
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v5, v5, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v10, v13, v13
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v6, v6, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v11, v14, v14
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v7, v7, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_max_f16_e32 v12, v15, v15
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v3, v3, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v4, v4, v9
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v5, v5, v10
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v6, v6, v11
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v7, v7, v12
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v2, v4, v5
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v3, v6, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v2.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v3.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v4.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v5.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v6.l, v6.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v7.l, v7.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_num_f16_e32 v8, v8, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v2, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v3, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v5, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v6, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_num_f16 v0, v0, v7, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v0.h, v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v1.h, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v2.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.h, v2.h, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v1.h, v3.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.l, v3.h, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v2.h, v4.l, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.l, v4.h, v4.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v3.h, v5.l, v5.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v4.l, v5.h, v5.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v4.h, v6.l, v6.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v5.l, v6.h, v6.h
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v5.h, v7.l, v7.l
+; GFX12-GISEL-TRUE16-NEXT:    v_max_num_f16_e32 v6.l, v7.h, v7.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v1.h, v1.h, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v2.l, v2.h, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v2.h, v3.h, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v3.l, v4.h, v5.l
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v3.h, v5.h, v6.l
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v1.h, v3.l, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmin_v16half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v0, v0, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v8, v8, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v1, v1, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v9, v9, v9
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v2, v2, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v10, v10, v10
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v1, v1, v9
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v2, v2, v10
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v3, v3, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v8, v11, v11
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v4, v4, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v9, v12, v12
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v5, v5, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v10, v13, v13
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v6, v6, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v11, v14, v14
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v7, v7, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_max_num_f16_e32 v12, v15, v15
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v3, v3, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v4, v4, v9
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v5, v5, v10
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v6, v6, v11
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v7, v7, v12
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v2, v4, v5
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v3, v6, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_min_num_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmin.v16half(<16 x half> %v)
   ret half %res
@@ -3191,6 +3579,8 @@ declare double @llvm.vector.reduce.fmin.v8double(<8 x double>)
 declare double @llvm.vector.reduce.fmin.v16double(<16 x double>)
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10: {{.*}}
+; GFX11: {{.*}}
+; GFX12: {{.*}}
 ; GFX7: {{.*}}
 ; GFX8: {{.*}}
 ; GFX9: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
index 9ca391682459a..2f628b7cdb281 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
@@ -3,9 +3,12 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define half @test_vector_reduce_fminimum_v2half(<2 x half> %v) {
 ; GFX7-LABEL: test_vector_reduce_fminimum_v2half:
@@ -49,27 +52,68 @@ define half @test_vector_reduce_fminimum_v2half(<2 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fminimum_v2half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fminimum_v2half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v2half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v2half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v2half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v2half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fminimum_v2half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fminimum_v2half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fminimum.v2half(<2 x half> %v)
   ret half %res
@@ -132,49 +176,89 @@ define half @test_vector_reduce_fminimum_v3half(<3 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fminimum_v3half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v3, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fminimum_v3half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0x7c00
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fminimum_v3half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v3half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v3half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v3, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v3half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7c00
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v3half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x7c00
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fminimum_v3half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fminimum_v3half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fminimum.v3half(<3 x half> %v)
   ret half %res
@@ -253,53 +337,95 @@ define half @test_vector_reduce_fminimum_v4half(<4 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fminimum_v4half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v3, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fminimum_v4half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fminimum_v4half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v1, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v4half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v4half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v3, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v4half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v4half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fminimum_v4half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fminimum_v4half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v1, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fminimum.v4half(<4 x half> %v)
   ret half %res
@@ -440,82 +566,147 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fminimum_v8half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v5, v0, v4
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v4, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v5
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v4
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fminimum_v8half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fminimum_v8half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v4
-; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v1, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v2, v2, v6
-; GFX12-GISEL-NEXT:    v_minimum_f16 v3, v3, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v2, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v8half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v8half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v5, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v4, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v8half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v8half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fminimum_v8half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.h, v3.l, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fminimum_v8half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v1, v5
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v2, v2, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v3, v3, v7
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fminimum.v8half(<8 x half> %v)
   ret half %res
@@ -780,133 +971,243 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fminimum_v16half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v9, v0, v8
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v8, v0, v1
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v9
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v8
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v4
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v5
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v6
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v7
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v1, v0, v2
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fminimum_v16half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fminimum_v16half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v1, v9
-; GFX12-GISEL-NEXT:    v_minimum_f16 v2, v2, v10
-; GFX12-GISEL-NEXT:    v_minimum_f16 v3, v3, v11
-; GFX12-GISEL-NEXT:    v_minimum_f16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_minimum_f16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_minimum_f16 v6, v6, v14
-; GFX12-GISEL-NEXT:    v_minimum_f16 v7, v7, v15
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_minimum_f16 v2, v4, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v3, v6, v7
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_minimum_f16 v1, v2, v3
-; GFX12-GISEL-NEXT:    v_minimum_f16 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v16half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v16half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v9, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v9, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v8, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v8, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v1, vcc_lo
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fminimum_v16half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fminimum_v16half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fminimum_v16half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.h, v3.l, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v2.l, v4.l, v4.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v2.h, v5.l, v5.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v3.l, v6.l, v6.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v3.h, v7.l, v7.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.l, v2.l, v2.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v1.h, v3.l, v3.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.h, v1.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fminimum_v16half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v1, v9
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v2, v2, v10
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v3, v3, v11
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v4, v4, v12
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v5, v5, v13
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v6, v6, v14
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v7, v7, v15
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v2, v4, v5
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v3, v6, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v1, v2, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_minimum_f16 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fminimum.v16half(<16 x half> %v)
   ret half %res

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
index d40625ac7353f..657fe0f0804f3 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
@@ -7,10 +7,14 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v2half:
@@ -60,27 +64,91 @@ define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) {
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmul_v2half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmul_v2half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v2half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmul.v2half(half %sp, <2 x half> %v)
   ret half %res
@@ -144,53 +212,97 @@ define half @test_vector_reduce_fmul_v3half(half %sp, <3 x half> %v) {
 ; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_fmul_v3half:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: test_vector_reduce_fmul_v3half:
-; GFX11-GISEL:       ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: test_vector_reduce_fmul_v3half:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_vector_reduce_fmul_v3half:
-; GFX12-GISEL:       ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX12-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v3half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmul.v3half(half %sp, <3 x half> %v)
   ret half %res
@@ -264,35 +376,121 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) {
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmul_v4half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmul_v4half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v4half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmul.v4half(half %sp, <4 x half> %v)
   ret half %res
@@ -406,51 +604,181 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) {
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmul_v8half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmul_v8half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v8half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmul.v8half(half %sp, <8 x half> %v)
   ret half %res
@@ -644,83 +972,301 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) {
 ; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_fmul_v16half:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: test_vector_reduce_fmul_v16half:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v5
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v6
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v7
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v8
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v7.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v8.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v6
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX11-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v7.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v8.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX11-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v5
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v6
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v7
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v8
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v6.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v7.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v8.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v6
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX12-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v4.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v5.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v6.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v7.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v8.l
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_vector_reduce_fmul_v16half:
+; GFX12-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v2
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v3
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v4
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v5
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v6
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v7
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v8
+; GFX12-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX12-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call half @llvm.vector.reduce.fmul.v16half(half %sp, <16 x half> %v)
   ret half %res

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index e035256694ad5..98919f565d902 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_mul_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_mul_v2i8:
@@ -71,18 +76,31 @@ define i8 @test_vector_reduce_mul_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -90,22 +108,39 @@ define i8 @test_vector_reduce_mul_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -178,13 +213,21 @@ define i8 @test_vector_reduce_mul_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v1, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v1, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -194,17 +237,29 @@ define i8 @test_vector_reduce_mul_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v1, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v1, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -295,15 +350,25 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -315,19 +380,33 @@ define i8 @test_vector_reduce_mul_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -452,20 +531,35 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v2.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.h, v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -482,24 +576,43 @@ define i8 @test_vector_reduce_mul_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v2.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.h, v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -693,29 +806,53 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v6, v6, v14
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v2, v2, v10
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v3, v3, v11
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v4, v4, v12
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.h, v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v5.l, v7.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v6.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v2.l, v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v2.h, v3.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v3.l, v4.l, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v2.l, v2.h, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v6, v6, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v10
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v3, v3, v11
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -742,33 +879,61 @@ define i8 @test_vector_reduce_mul_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v6, v6, v14
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v2, v2, v10
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v3, v3, v11
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v4, v4, v12
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.h, v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v5.l, v7.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v6.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v2.l, v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v2.h, v3.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v3.l, v4.l, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.h, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v2.l, v2.h, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v1.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v6, v6, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v10
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v3, v3, v11
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -856,13 +1021,22 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -872,17 +1046,30 @@ define i16 @test_vector_reduce_mul_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -964,16 +1151,25 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v3i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_perm_b32 v1, 1, v1, 0x5040100
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v1, 1, v1, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -984,20 +1180,33 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v3i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_perm_b32 v1, 1, v1, 0x5040100
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v1, 1, v1, 0x5040100
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v3i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1085,14 +1294,25 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v4i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1103,18 +1323,33 @@ define i16 @test_vector_reduce_mul_v4i16(<4 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v4i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v4i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1228,17 +1463,30 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1252,21 +1500,38 @@ define i16 @test_vector_reduce_mul_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1431,22 +1696,40 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_mul_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1465,26 +1748,48 @@ define i16 @test_vector_reduce_mul_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_mul_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -4029,3 +4334,6 @@ declare i64 @llvm.vector.reduce.mul.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
index 46b6e0079a99c..bdb1c22ce7267 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_or_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_or_v2i8:
@@ -70,18 +75,31 @@ define i8 @test_vector_reduce_or_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -89,22 +107,39 @@ define i8 @test_vector_reduce_or_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -175,13 +210,21 @@ define i8 @test_vector_reduce_or_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -189,17 +232,29 @@ define i8 @test_vector_reduce_or_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -300,15 +355,25 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, 0xff00, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -323,19 +388,33 @@ define i8 @test_vector_reduce_or_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, 0xff00, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -473,20 +552,35 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v2.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v1.h, v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, 0xff00, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -504,24 +598,43 @@ define i8 @test_vector_reduce_or_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v2.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v1.h, v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, 0xff00, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -720,29 +833,53 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v6, v6, v14
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v3, v3, v11
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, v4, v12
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v1.h, v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v5.l, v7.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v2.h, v3.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v3.l, v4.l, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, 0xff00, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -766,33 +903,61 @@ define i8 @test_vector_reduce_or_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, 0xff0000, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v6, v6, v14
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v2, v2, v10
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v3, v3, v11
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v4, v4, v12
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v1.h, v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v5.l, v7.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v6.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v2.l, v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v2.h, v3.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v3.l, v4.l, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.h, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v1.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, 0xff00, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v6, v6, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v10
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v11
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, 0xffffff00, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -878,13 +1043,22 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -894,17 +1068,30 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -963,25 +1150,67 @@ define i16 @test_vector_reduce_or_v3i16(<3 x i16> %v) {
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_or_v3i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_vector_reduce_or_v3i16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: test_vector_reduce_or_v3i16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_or3_b32 v0, v0, v2, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_vector_reduce_or_v3i16:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.or.v3i16(<3 x i16> %v)
   ret i16 %res
@@ -1043,27 +1272,73 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) {
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_or_v4i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_vector_reduce_or_v4i16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: test_vector_reduce_or_v4i16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_vector_reduce_or_v4i16:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v)
   ret i16 %res
@@ -1157,16 +1432,28 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1180,20 +1467,36 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1349,19 +1652,34 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_or_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or3_b32 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_or_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1380,23 +1698,42 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_or_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or3_b32 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_or3_b32 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_or_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or3_b32 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_or3_b32 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_or_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -2826,3 +3163,6 @@ declare i64 @llvm.vector.reduce.or.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index 05d826872da34..c7f9ec8632998 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i8:
@@ -65,14 +70,23 @@ define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -83,18 +97,31 @@ define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -189,16 +216,27 @@ define i8 @test_vector_reduce_smax_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_max3_i16 v0, v1, v0, 0xff80
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v0.l, 0xff80
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v1, v0, 0xff80
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -211,20 +249,35 @@ define i8 @test_vector_reduce_smax_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v2
-; GFX12-SDAG-NEXT:    v_max3_i16 v0, v1, v0, 0xff80
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v0.l, 0xff80
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v1, v0, 0xff80
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -395,22 +448,41 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_i16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max3_i16 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v0.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -446,26 +518,49 @@ define i8 @test_vector_reduce_smax_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max3_i16 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v0.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -709,30 +804,66 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-NEXT:    v_max_i16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_i16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max_i16 v2, v2, v3
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_i16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v5.l, v3.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -776,34 +907,74 @@ define i8 @test_vector_reduce_smax_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-NEXT:    v_max_i16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_i16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max_i16 v2, v2, v3
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_i16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v5.l, v3.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v2, v2, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1153,44 +1324,97 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v15, v15, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v11, v11, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v9, v9, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v13, v13, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_max_i16 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX11-SDAG-NEXT:    v_max_i16 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_max_i16 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_max3_i16 v3, v3, v11, v7
-; GFX11-SDAG-NEXT:    v_bfe_i32 v7, v14, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v8, v8, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-NEXT:    v_max3_i16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    v_max_i16 v5, v6, v7
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_max_i16 v3, v4, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_max3_i16 v2, v2, v10, v5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_max3_i16 v0, v0, v3, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v5.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v3.h, v9.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.l, v13.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.h, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v15, v15, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v13, v13, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v3, v3, v11, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v8, v8, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v5, v6, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v3, v4, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v2, v2, v10, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v0, v3, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1249,48 +1473,105 @@ define i8 @test_vector_reduce_smax_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v15, v15, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v11, v11, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v9, v9, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v13, v13, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_max_i16 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX12-SDAG-NEXT:    v_max_i16 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_max_i16 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_max3_i16 v3, v3, v11, v7
-; GFX12-SDAG-NEXT:    v_bfe_i32 v7, v14, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v8, v8, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-NEXT:    v_max3_i16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    v_max_i16 v5, v6, v7
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_max_i16 v3, v4, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_max3_i16 v2, v2, v10, v5
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_max3_i16 v0, v0, v3, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v5.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v3.h, v9.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.l, v13.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v1.l, v3.h, v1.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.h, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v1.h, v4.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v15, v15, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v13, v13, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v3, v3, v11, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v8, v8, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v5, v6, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v3, v4, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v2, v2, v10, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_i16 v0, v0, v3, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1421,13 +1702,19 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1437,17 +1724,27 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1532,17 +1829,26 @@ define i16 @test_vector_reduce_smax_v3i16(<3 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_max_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v3i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v3i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1553,22 +1859,35 @@ define i16 @test_vector_reduce_smax_v3i16(<3 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_max_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v3i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x8000
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v3i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1668,14 +1987,22 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v4i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1686,18 +2013,30 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v4i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v4i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1830,17 +2169,27 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1854,21 +2203,35 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -2066,22 +2429,37 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smax_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v1, v1, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smax_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -2100,26 +2478,45 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smax_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smax_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smax_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smax_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -4211,3 +4608,6 @@ declare i64 @llvm.vector.reduce.smax.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index db92e3b401340..f7ad431cae1e3 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i8:
@@ -65,14 +70,23 @@ define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -83,18 +97,31 @@ define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -189,16 +216,27 @@ define i8 @test_vector_reduce_smin_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_min3_i16 v0, v1, v0, 0x7f
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v0.l, 0x7f
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v1, v0, 0x7f
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -211,20 +249,35 @@ define i8 @test_vector_reduce_smin_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v2
-; GFX12-SDAG-NEXT:    v_min3_i16 v0, v1, v0, 0x7f
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v0.l, 0x7f
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v1, v0, 0x7f
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -395,22 +448,41 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_i16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min3_i16 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v0.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -446,26 +518,49 @@ define i8 @test_vector_reduce_smin_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min3_i16 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v0.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -709,30 +804,66 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-NEXT:    v_min_i16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_i16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min_i16 v2, v2, v3
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_i16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v5.l, v3.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -776,34 +907,74 @@ define i8 @test_vector_reduce_smin_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-NEXT:    v_min_i16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_i16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v6, 0, 8
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min_i16 v2, v2, v3
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_i16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v5.l, v3.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.l, v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v6, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v2, v2, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1153,44 +1324,97 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v15, v15, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v11, v11, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v9, v9, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v13, v13, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_min_i16 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX11-SDAG-NEXT:    v_min_i16 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_min_i16 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_min3_i16 v3, v3, v11, v7
-; GFX11-SDAG-NEXT:    v_bfe_i32 v7, v14, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v8, v8, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-SDAG-NEXT:    v_min3_i16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX11-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX11-SDAG-NEXT:    v_min_i16 v5, v6, v7
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_min_i16 v3, v4, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_min3_i16 v2, v2, v10, v5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_min3_i16 v0, v0, v3, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v5.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v3.h, v9.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.l, v13.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.h, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v4.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v15, v15, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v13, v13, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v3, v3, v11, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v8, v8, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v5, v6, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v3, v4, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v2, v2, v10, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v0, v3, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1249,48 +1473,105 @@ define i8 @test_vector_reduce_smin_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_bfe_i32 v15, v15, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v11, v11, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v3, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v9, v9, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v13, v13, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_min_i16 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_bfe_i32 v6, v6, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v10, v10, 0, 8
-; GFX12-SDAG-NEXT:    v_min_i16 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_min_i16 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_min3_i16 v3, v3, v11, v7
-; GFX12-SDAG-NEXT:    v_bfe_i32 v7, v14, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v2, v2, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v8, v8, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX12-SDAG-NEXT:    v_min3_i16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_bfe_i32 v3, v12, 0, 8
-; GFX12-SDAG-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX12-SDAG-NEXT:    v_min_i16 v5, v6, v7
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_min_i16 v3, v4, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_min3_i16 v2, v2, v10, v5
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_min3_i16 v0, v0, v3, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v16, v11, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v17, v3, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v15, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v11, v9, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v9, v13, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v13, v1, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.l, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v17.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v16.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v5.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v3.h, v9.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v6, v8, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.l, v13.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v8, v0, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v14, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v1.l, v3.h, v1.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v9.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.h, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v1.h, v4.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.h, v1.l, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_i16 v0.l, v1.h, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v15, v15, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v11, v11, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v9, v9, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v13, v13, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v10, v10, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v3, v3, v11, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v7, v14, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v8, v8, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v3, v12, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v5, v6, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v3, v4, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v2, v2, v10, v5
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_i16 v0, v0, v3, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1421,13 +1702,19 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1437,17 +1724,27 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1532,17 +1829,26 @@ define i16 @test_vector_reduce_smin_v3i16(<3 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_min_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v3i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x7fff
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v3i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1553,22 +1859,35 @@ define i16 @test_vector_reduce_smin_v3i16(<3 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_min_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v3i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_movk_i32 s0, 0x7fff
-; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7fff
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v1, s0, v1, 0x5040100
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v3i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1668,14 +1987,22 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v4i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1686,18 +2013,30 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v4i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v4i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1830,17 +2169,27 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1854,21 +2203,35 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -2066,22 +2429,37 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_smin_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v1, v1, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_smin_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -2100,26 +2478,45 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_i16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_smin_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_i16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_smin_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_i16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_smin_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_i16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_smin_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -4211,3 +4608,6 @@ declare i64 @llvm.vector.reduce.smin.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index 57e24d4e431aa..2eeedd4cfffba 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_umax_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i8:
@@ -57,14 +62,23 @@ define i8 @test_vector_reduce_umax_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -75,18 +89,31 @@ define i8 @test_vector_reduce_umax_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -174,15 +201,25 @@ define i8 @test_vector_reduce_umax_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max3_u16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v1.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -195,19 +232,33 @@ define i8 @test_vector_reduce_umax_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max3_u16 v0, v1, v0, v2
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v1.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v1, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -367,22 +418,39 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max3_u16 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -417,26 +485,47 @@ define i8 @test_vector_reduce_umax_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max3_u16 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v0.l, v0.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -664,30 +753,56 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-NEXT:    v_max_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_u16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max_u16 v2, v2, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_u16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v1.h, v3.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.h, v1.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -730,34 +845,64 @@ define i8 @test_vector_reduce_umax_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-NEXT:    v_max_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_u16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max_u16 v2, v2, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_u16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v1.h, v3.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.h, v1.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v0.l, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v2, v2, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1089,46 +1234,89 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_max_u16 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-NEXT:    v_max_u16 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_max_u16 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_max_u16 v6, v6, v14
-; GFX11-SDAG-NEXT:    v_max3_u16 v3, v3, v11, v7
-; GFX11-SDAG-NEXT:    v_max_u16 v4, v4, v12
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_max3_u16 v2, v2, v10, v6
-; GFX11-SDAG-NEXT:    v_max3_u16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_max3_u16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.h, v0.h, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v4.h, v5.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v1.h, v6.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v0.h, v3.l, v3.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v3.l, v4.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v1.h, v2.l, v2.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v6, v6, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v3, v3, v11, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v2, v2, v10, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1187,50 +1375,97 @@ define i8 @test_vector_reduce_umax_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_max_u16 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-NEXT:    v_max_u16 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_max_u16 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_max_u16 v6, v6, v14
-; GFX12-SDAG-NEXT:    v_max3_u16 v3, v3, v11, v7
-; GFX12-SDAG-NEXT:    v_max_u16 v4, v4, v12
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_max3_u16 v2, v2, v10, v6
-; GFX12-SDAG-NEXT:    v_max3_u16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_max3_u16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.h, v0.h, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v4.h, v5.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v1.h, v6.l, v6.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v0.h, v3.l, v3.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v3.l, v4.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v1.h, v2.l, v2.h, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_max3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v6, v6, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v3, v3, v11, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v2, v2, v10, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_max3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1359,13 +1594,19 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1375,17 +1616,27 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1464,16 +1715,25 @@ define i16 @test_vector_reduce_umax_v3i16(<3 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v3i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v3i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1484,20 +1744,33 @@ define i16 @test_vector_reduce_umax_v3i16(<3 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v3i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v3i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1595,14 +1868,22 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v4i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1613,18 +1894,30 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v4i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v4i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1755,17 +2048,27 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1779,21 +2082,35 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1989,22 +2306,37 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umax_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v1, v1, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umax_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -2023,26 +2355,45 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_max_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umax_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_max_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_max_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umax_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_max_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umax_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -4134,3 +4485,6 @@ declare i64 @llvm.vector.reduce.umax.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index b44ec6a24e49d..6e4a06b3f8f4e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_umin_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i8:
@@ -57,14 +62,23 @@ define i8 @test_vector_reduce_umin_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -75,18 +89,31 @@ define i8 @test_vector_reduce_umin_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -174,15 +201,25 @@ define i8 @test_vector_reduce_umin_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min3_u16 v0, v1, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v1.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v1, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -195,19 +232,33 @@ define i8 @test_vector_reduce_umin_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min3_u16 v0, v1, v0, v2
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v1.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v1, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -316,22 +367,39 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min3_u16 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v0.l, v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -348,26 +416,47 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min3_u16 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v0.l, v0.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -528,30 +617,56 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-NEXT:    v_min_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_u16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min_u16 v2, v2, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_u16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v1.h, v3.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.h, v1.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v0.l, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v2, v2, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -577,34 +692,64 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-NEXT:    v_min_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_u16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v6
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min_u16 v2, v2, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_u16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v1.h, v3.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v1.l, v1.l, v3.h, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.h, v1.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v0.l, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v2, v2, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -870,46 +1015,89 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-SDAG-NEXT:    v_min_u16 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-SDAG-NEXT:    v_min_u16 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_min_u16 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_min_u16 v6, v6, v14
-; GFX11-SDAG-NEXT:    v_min3_u16 v3, v3, v11, v7
-; GFX11-SDAG-NEXT:    v_min_u16 v4, v4, v12
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_min3_u16 v2, v2, v10, v6
-; GFX11-SDAG-NEXT:    v_min3_u16 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_min3_u16 v0, v0, v4, v2
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.h, v0.h, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v4.h, v5.l, v4.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v1.h, v6.l, v6.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v0.h, v3.l, v3.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v3.l, v4.l, v5.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v1.h, v2.l, v2.h, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v6, v6, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v3, v3, v11, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v2, v2, v10, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v0, v4, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -952,50 +1140,97 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX12-SDAG-NEXT:    v_min_u16 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX12-SDAG-NEXT:    v_min_u16 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_min_u16 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_min_u16 v6, v6, v14
-; GFX12-SDAG-NEXT:    v_min3_u16 v3, v3, v11, v7
-; GFX12-SDAG-NEXT:    v_min_u16 v4, v4, v12
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v8
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_min3_u16 v2, v2, v10, v6
-; GFX12-SDAG-NEXT:    v_min3_u16 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_min3_u16 v0, v0, v4, v2
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v2.h, 0xff, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v10.l, 0xff, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.h, 0xff, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v3.l, 0xff, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v6.h, 0xff, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v6.l, 0xff, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v4.h, 0xff, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v5.l, 0xff, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.h, v0.h, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v2.l, 0xff, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v7.l, 0xff, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v5.h, 0xff, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v4.l, 0xff, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v4.h, v5.l, v4.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v1.h, v6.l, v6.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v0.h, v3.l, v3.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v3.l, v4.l, v5.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v1.h, v2.l, v2.h, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v0.h, v1.l, v4.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_min3_u16 v0.l, v0.l, v3.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v6, v6, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v3, v3, v11, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v2, v2, v10, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_min3_u16 v0, v0, v4, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1105,13 +1340,19 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1121,17 +1362,27 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1212,16 +1463,25 @@ define i16 @test_vector_reduce_umin_v3i16(<3 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v3i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_perm_b32 v1, -1, v1, 0x5040100
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, -1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v1, -1, v1, 0x5040100
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v3i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1232,20 +1492,33 @@ define i16 @test_vector_reduce_umin_v3i16(<3 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_min_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v3i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_perm_b32 v1, -1, v1, 0x5040100
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, -1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_perm_b32 v1, -1, v1, 0x5040100
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v3i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1340,14 +1613,22 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v4i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v4i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1358,18 +1639,30 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v4i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v4i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1497,17 +1790,27 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1521,21 +1824,35 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1728,22 +2045,37 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_umin_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v1, v1, v5
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_umin_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1762,26 +2094,45 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_pk_min_u16 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_umin_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_pk_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_min_u16 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_umin_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v1, v1, v5
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_min_u16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_umin_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_pk_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_min_u16 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_umin_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -3873,3 +4224,6 @@ declare i64 @llvm.vector.reduce.umin.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
index 277a63f00c2c6..cf344ea9b92d4 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll
@@ -7,10 +7,15 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
+; FIXME-TRUE16. enable gisel
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
+; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
 
 define i8 @test_vector_reduce_xor_v2i8(<2 x i8> %v) {
 ; GFX7-SDAG-LABEL: test_vector_reduce_xor_v2i8:
@@ -69,18 +74,31 @@ define i8 @test_vector_reduce_xor_v2i8(<2 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v2i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v2i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -88,22 +106,39 @@ define i8 @test_vector_reduce_xor_v2i8(<2 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v2i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b16 v2, 8, v1
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v2
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX12-SDAG-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.h, 8, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 8, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v2i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -175,13 +210,21 @@ define i8 @test_vector_reduce_xor_v3i8(<3 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_xor3_b32 v0, v0, v1, v2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v3i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v3i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v3i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v3i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -189,17 +232,29 @@ define i8 @test_vector_reduce_xor_v3i8(<3 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_xor3_b32 v0, v0, v1, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v3i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v3i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v3i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v3i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -287,15 +342,25 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v4i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v4i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v4i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v4i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -306,19 +371,33 @@ define i8 @test_vector_reduce_xor_v4i8(<4 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v4i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v4i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v1.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v4i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v4i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -440,20 +519,35 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v8i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v8i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v2.l, v6.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v1.h, v3.l, v7.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v8i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v8i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -468,24 +562,43 @@ define i8 @test_vector_reduce_xor_v8i8(<8 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v8i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v8i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v2.l, v6.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v1.h, v3.l, v7.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v4.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v8i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v8i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -672,29 +785,53 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX10-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v16i8:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v5, v5, v13
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v7, v7, v15
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v6, v6, v14
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v10
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v11
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v4, v4, v12
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v16i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v1.h, v5.l, v13.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v9.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v5.l, v7.l, v15.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v6.l, v14.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v2.l, v2.l, v10.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v2.h, v3.l, v11.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v3.l, v4.l, v12.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v8.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v2.l, v2.h, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v16i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v5, v5, v13
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v9
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v7, v7, v15
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v6, v6, v14
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v10
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v4, v4, v12
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v16i8:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -715,33 +852,61 @@ define i8 @test_vector_reduce_xor_v16i8(<16 x i8> %v) {
 ; GFX11-GISEL-NEXT:    v_bfe_u32 v0, v0, 0, 8
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v16i8:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v5, v5, v13
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v7, v7, v15
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v6, v6, v14
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v10
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v11
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v4, v4, v12
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v8
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v16i8:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v1.h, v5.l, v13.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v9.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v5.l, v7.l, v15.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v6.l, v14.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v2.l, v2.l, v10.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v2.h, v3.l, v11.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v3.l, v4.l, v12.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v8.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v1.h
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.h, v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v2.l, v2.h, v5.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v3.l
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v1.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b16 v0.l, v0.l, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v16i8:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v5, v5, v13
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v9
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v7, v7, v15
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v6, v6, v14
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v10
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v11
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v4, v4, v12
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v16i8:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -824,13 +989,22 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v2i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v2i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -840,17 +1014,30 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v2i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_alignbit_b32 v1, s0, v0, 16
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v0, 16
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v2i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -909,25 +1096,67 @@ define i16 @test_vector_reduce_xor_v3i16(<3 x i16> %v) {
 ; GFX10-NEXT:    v_xor3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_xor_v3i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_xor3_b32 v0, v0, v2, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_vector_reduce_xor_v3i16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: test_vector_reduce_xor_v3i16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_xor3_b32 v0, v0, v2, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v3i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v3i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_vector_reduce_xor_v3i16:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.xor.v3i16(<3 x i16> %v)
   ret i16 %res
@@ -989,27 +1218,73 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) {
 ; GFX10-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_vector_reduce_xor_v4i16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_vector_reduce_xor_v4i16:
+; GFX11-GISEL:       ; %bb.0: ; %entry
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-LABEL: test_vector_reduce_xor_v4i16:
-; GFX12:       ; %bb.0: ; %entry
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v4i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v4i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: test_vector_reduce_xor_v4i16:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v)
   ret i16 %res
@@ -1104,16 +1379,28 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v8i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor3_b32 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v8i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v8i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v8i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1127,20 +1414,36 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v8i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor3_b32 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v8i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v8i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v8i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -1298,19 +1601,34 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX10-GISEL-NEXT:    v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_vector_reduce_xor_v16i16:
-; GFX11-SDAG:       ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_xor3_b32 v1, v1, v5, v3
-; GFX11-SDAG-NEXT:    v_xor3_b32 v0, v0, v2, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v16i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v16i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v16i16:
 ; GFX11-GISEL:       ; %bb.0: ; %entry
@@ -1329,23 +1647,42 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
 ; GFX11-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: test_vector_reduce_xor_v16i16:
-; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_xor3_b32 v1, v1, v5, v3
-; GFX12-SDAG-NEXT:    v_xor3_b32 v0, v0, v2, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v16i16:
+; GFX12-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX12-SDAG-TRUE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v16i16:
+; GFX12-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_xor3_b32 v1, v1, v5, v3
+; GFX12-SDAG-FAKE16-NEXT:    v_xor3_b32 v0, v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX12-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v16i16:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
@@ -2957,3 +3294,6 @@ declare i64 @llvm.vector.reduce.xor.v3i64(<3 x i64>)
 declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
 declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
 declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-GISEL-FAKE16: {{.*}}
+; GFX12-GISEL-FAKE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
index b079a94b5fcc3..587f5d05d358b 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_rebroadcast.ll
@@ -1,35 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <2 x i8> @shuffle_v2i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_ushort v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v2i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b16 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v2i8_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b16 v1.l, 8, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v2i8_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b16 v0, 8, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i8> %val0, <2 x i8> poison, <2 x i32> <i32 1, i32 1>
@@ -39,37 +51,49 @@ entry:
 define <4 x i8> @shuffle_v4i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v4i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v4i8_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4i8_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i8> %val0, <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -79,49 +103,65 @@ entry:
 define <8 x i8> @shuffle_v8i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v8i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v8i8_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 8, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v7.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v8i8_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i8> %val0, <8 x i8> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -131,73 +171,97 @@ entry:
 define <16 x i8> @shuffle_v16i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v16i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v16i8_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 8, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v15.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v15.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v16i8_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i8> %val0, <16 x i8> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -207,121 +271,161 @@ entry:
 define <32 x i8> @shuffle_v32i8_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  v_mov_b32_e32 v16, v0
-; GFX9-NEXT:  v_mov_b32_e32 v17, v0
-; GFX9-NEXT:  v_mov_b32_e32 v18, v0
-; GFX9-NEXT:  v_mov_b32_e32 v19, v0
-; GFX9-NEXT:  v_mov_b32_e32 v20, v0
-; GFX9-NEXT:  v_mov_b32_e32 v21, v0
-; GFX9-NEXT:  v_mov_b32_e32 v22, v0
-; GFX9-NEXT:  v_mov_b32_e32 v23, v0
-; GFX9-NEXT:  v_mov_b32_e32 v24, v0
-; GFX9-NEXT:  v_mov_b32_e32 v25, v0
-; GFX9-NEXT:  v_mov_b32_e32 v26, v0
-; GFX9-NEXT:  v_mov_b32_e32 v27, v0
-; GFX9-NEXT:  v_mov_b32_e32 v28, v0
-; GFX9-NEXT:  v_mov_b32_e32 v29, v0
-; GFX9-NEXT:  v_mov_b32_e32 v30, v0
-; GFX9-NEXT:  v_mov_b32_e32 v31, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v17, v0
+; GFX9-NEXT:    v_mov_b32_e32 v18, v0
+; GFX9-NEXT:    v_mov_b32_e32 v19, v0
+; GFX9-NEXT:    v_mov_b32_e32 v20, v0
+; GFX9-NEXT:    v_mov_b32_e32 v21, v0
+; GFX9-NEXT:    v_mov_b32_e32 v22, v0
+; GFX9-NEXT:    v_mov_b32_e32 v23, v0
+; GFX9-NEXT:    v_mov_b32_e32 v24, v0
+; GFX9-NEXT:    v_mov_b32_e32 v25, v0
+; GFX9-NEXT:    v_mov_b32_e32 v26, v0
+; GFX9-NEXT:    v_mov_b32_e32 v27, v0
+; GFX9-NEXT:    v_mov_b32_e32 v28, v0
+; GFX9-NEXT:    v_mov_b32_e32 v29, v0
+; GFX9-NEXT:    v_mov_b32_e32 v30, v0
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i8_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  v_mov_b32_e32 v16, v0
-; GFX10-NEXT:  v_mov_b32_e32 v17, v0
-; GFX10-NEXT:  v_mov_b32_e32 v18, v0
-; GFX10-NEXT:  v_mov_b32_e32 v19, v0
-; GFX10-NEXT:  v_mov_b32_e32 v20, v0
-; GFX10-NEXT:  v_mov_b32_e32 v21, v0
-; GFX10-NEXT:  v_mov_b32_e32 v22, v0
-; GFX10-NEXT:  v_mov_b32_e32 v23, v0
-; GFX10-NEXT:  v_mov_b32_e32 v24, v0
-; GFX10-NEXT:  v_mov_b32_e32 v25, v0
-; GFX10-NEXT:  v_mov_b32_e32 v26, v0
-; GFX10-NEXT:  v_mov_b32_e32 v27, v0
-; GFX10-NEXT:  v_mov_b32_e32 v28, v0
-; GFX10-NEXT:  v_mov_b32_e32 v29, v0
-; GFX10-NEXT:  v_mov_b32_e32 v30, v0
-; GFX10-NEXT:  v_mov_b32_e32 v31, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v32i8_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_lshrrev_b32_e32 v0, 8, v0
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  v_mov_b32_e32 v16, v0
-; GFX11-NEXT:  v_mov_b32_e32 v17, v0
-; GFX11-NEXT:  v_mov_b32_e32 v18, v0
-; GFX11-NEXT:  v_mov_b32_e32 v19, v0
-; GFX11-NEXT:  v_mov_b32_e32 v20, v0
-; GFX11-NEXT:  v_mov_b32_e32 v21, v0
-; GFX11-NEXT:  v_mov_b32_e32 v22, v0
-; GFX11-NEXT:  v_mov_b32_e32 v23, v0
-; GFX11-NEXT:  v_mov_b32_e32 v24, v0
-; GFX11-NEXT:  v_mov_b32_e32 v25, v0
-; GFX11-NEXT:  v_mov_b32_e32 v26, v0
-; GFX11-NEXT:  v_mov_b32_e32 v27, v0
-; GFX11-NEXT:  v_mov_b32_e32 v28, v0
-; GFX11-NEXT:  v_mov_b32_e32 v29, v0
-; GFX11-NEXT:  v_mov_b32_e32 v30, v0
-; GFX11-NEXT:  v_mov_b32_e32 v31, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v17, v0
+; GFX10-NEXT:    v_mov_b32_e32 v18, v0
+; GFX10-NEXT:    v_mov_b32_e32 v19, v0
+; GFX10-NEXT:    v_mov_b32_e32 v20, v0
+; GFX10-NEXT:    v_mov_b32_e32 v21, v0
+; GFX10-NEXT:    v_mov_b32_e32 v22, v0
+; GFX10-NEXT:    v_mov_b32_e32 v23, v0
+; GFX10-NEXT:    v_mov_b32_e32 v24, v0
+; GFX10-NEXT:    v_mov_b32_e32 v25, v0
+; GFX10-NEXT:    v_mov_b32_e32 v26, v0
+; GFX10-NEXT:    v_mov_b32_e32 v27, v0
+; GFX10-NEXT:    v_mov_b32_e32 v28, v0
+; GFX10-NEXT:    v_mov_b32_e32 v29, v0
+; GFX10-NEXT:    v_mov_b32_e32 v30, v0
+; GFX10-NEXT:    v_mov_b32_e32 v31, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v32i8_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 8, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v13.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v16.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v18.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v19.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v21.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v22.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v23.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v24.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v25.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v26.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v27.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v28.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v29.l, v31.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v30.l, v31.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v32i8_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v16, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v17, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v18, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v19, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v20, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v21, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v22, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v23, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v24, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v25, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v26, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v27, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v28, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v29, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v30, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i8>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i8> %val0, <32 x i8> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -331,28 +435,39 @@ entry:
 define <2 x i16> @shuffle_v2i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v2i16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v2i16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v2i16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i16> %val0, <2 x i16> poison, <2 x i32> <i32 1, i32 1>
@@ -362,32 +477,44 @@ entry:
 define <4 x i16> @shuffle_v4i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v4i16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v4i16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4i16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i16> %val0, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -397,38 +524,52 @@ entry:
 define <8 x i16> @shuffle_v8i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v8i16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v8i16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v8i16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i16> %val0, <8 x i16> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -438,50 +579,68 @@ entry:
 define <16 x i16> @shuffle_v16i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v16i16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v16i16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v16i16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i16> %val0, <16 x i16> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -491,74 +650,100 @@ entry:
 define <32 x i16> @shuffle_v32i16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v32i16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v32i16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v32i16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i16>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i16> %val0, <32 x i16> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -568,27 +753,27 @@ entry:
 define <2 x i32> @shuffle_v2i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x i32> %val0, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
@@ -598,33 +783,33 @@ entry:
 define <4 x i32> @shuffle_v4i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x i32> %val0, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -634,45 +819,45 @@ entry:
 define <8 x i32> @shuffle_v8i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x i32> %val0, <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -682,69 +867,69 @@ entry:
 define <16 x i32> @shuffle_v16i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x i32> %val0, <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -754,117 +939,117 @@ entry:
 define <32 x i32> @shuffle_v32i32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  v_mov_b32_e32 v16, v0
-; GFX9-NEXT:  v_mov_b32_e32 v17, v0
-; GFX9-NEXT:  v_mov_b32_e32 v18, v0
-; GFX9-NEXT:  v_mov_b32_e32 v19, v0
-; GFX9-NEXT:  v_mov_b32_e32 v20, v0
-; GFX9-NEXT:  v_mov_b32_e32 v21, v0
-; GFX9-NEXT:  v_mov_b32_e32 v22, v0
-; GFX9-NEXT:  v_mov_b32_e32 v23, v0
-; GFX9-NEXT:  v_mov_b32_e32 v24, v0
-; GFX9-NEXT:  v_mov_b32_e32 v25, v0
-; GFX9-NEXT:  v_mov_b32_e32 v26, v0
-; GFX9-NEXT:  v_mov_b32_e32 v27, v0
-; GFX9-NEXT:  v_mov_b32_e32 v28, v0
-; GFX9-NEXT:  v_mov_b32_e32 v29, v0
-; GFX9-NEXT:  v_mov_b32_e32 v30, v0
-; GFX9-NEXT:  v_mov_b32_e32 v31, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    v_mov_b32_e32 v16, v0
+; GFX9-NEXT:    v_mov_b32_e32 v17, v0
+; GFX9-NEXT:    v_mov_b32_e32 v18, v0
+; GFX9-NEXT:    v_mov_b32_e32 v19, v0
+; GFX9-NEXT:    v_mov_b32_e32 v20, v0
+; GFX9-NEXT:    v_mov_b32_e32 v21, v0
+; GFX9-NEXT:    v_mov_b32_e32 v22, v0
+; GFX9-NEXT:    v_mov_b32_e32 v23, v0
+; GFX9-NEXT:    v_mov_b32_e32 v24, v0
+; GFX9-NEXT:    v_mov_b32_e32 v25, v0
+; GFX9-NEXT:    v_mov_b32_e32 v26, v0
+; GFX9-NEXT:    v_mov_b32_e32 v27, v0
+; GFX9-NEXT:    v_mov_b32_e32 v28, v0
+; GFX9-NEXT:    v_mov_b32_e32 v29, v0
+; GFX9-NEXT:    v_mov_b32_e32 v30, v0
+; GFX9-NEXT:    v_mov_b32_e32 v31, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off offset:4
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  v_mov_b32_e32 v16, v0
-; GFX10-NEXT:  v_mov_b32_e32 v17, v0
-; GFX10-NEXT:  v_mov_b32_e32 v18, v0
-; GFX10-NEXT:  v_mov_b32_e32 v19, v0
-; GFX10-NEXT:  v_mov_b32_e32 v20, v0
-; GFX10-NEXT:  v_mov_b32_e32 v21, v0
-; GFX10-NEXT:  v_mov_b32_e32 v22, v0
-; GFX10-NEXT:  v_mov_b32_e32 v23, v0
-; GFX10-NEXT:  v_mov_b32_e32 v24, v0
-; GFX10-NEXT:  v_mov_b32_e32 v25, v0
-; GFX10-NEXT:  v_mov_b32_e32 v26, v0
-; GFX10-NEXT:  v_mov_b32_e32 v27, v0
-; GFX10-NEXT:  v_mov_b32_e32 v28, v0
-; GFX10-NEXT:  v_mov_b32_e32 v29, v0
-; GFX10-NEXT:  v_mov_b32_e32 v30, v0
-; GFX10-NEXT:  v_mov_b32_e32 v31, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    v_mov_b32_e32 v16, v0
+; GFX10-NEXT:    v_mov_b32_e32 v17, v0
+; GFX10-NEXT:    v_mov_b32_e32 v18, v0
+; GFX10-NEXT:    v_mov_b32_e32 v19, v0
+; GFX10-NEXT:    v_mov_b32_e32 v20, v0
+; GFX10-NEXT:    v_mov_b32_e32 v21, v0
+; GFX10-NEXT:    v_mov_b32_e32 v22, v0
+; GFX10-NEXT:    v_mov_b32_e32 v23, v0
+; GFX10-NEXT:    v_mov_b32_e32 v24, v0
+; GFX10-NEXT:    v_mov_b32_e32 v25, v0
+; GFX10-NEXT:    v_mov_b32_e32 v26, v0
+; GFX10-NEXT:    v_mov_b32_e32 v27, v0
+; GFX10-NEXT:    v_mov_b32_e32 v28, v0
+; GFX10-NEXT:    v_mov_b32_e32 v29, v0
+; GFX10-NEXT:    v_mov_b32_e32 v30, v0
+; GFX10-NEXT:    v_mov_b32_e32 v31, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32i32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  v_mov_b32_e32 v16, v0
-; GFX11-NEXT:  v_mov_b32_e32 v17, v0
-; GFX11-NEXT:  v_mov_b32_e32 v18, v0
-; GFX11-NEXT:  v_mov_b32_e32 v19, v0
-; GFX11-NEXT:  v_mov_b32_e32 v20, v0
-; GFX11-NEXT:  v_mov_b32_e32 v21, v0
-; GFX11-NEXT:  v_mov_b32_e32 v22, v0
-; GFX11-NEXT:  v_mov_b32_e32 v23, v0
-; GFX11-NEXT:  v_mov_b32_e32 v24, v0
-; GFX11-NEXT:  v_mov_b32_e32 v25, v0
-; GFX11-NEXT:  v_mov_b32_e32 v26, v0
-; GFX11-NEXT:  v_mov_b32_e32 v27, v0
-; GFX11-NEXT:  v_mov_b32_e32 v28, v0
-; GFX11-NEXT:  v_mov_b32_e32 v29, v0
-; GFX11-NEXT:  v_mov_b32_e32 v30, v0
-; GFX11-NEXT:  v_mov_b32_e32 v31, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-NEXT:    v_mov_b32_e32 v16, v0
+; GFX11-NEXT:    v_mov_b32_e32 v17, v0
+; GFX11-NEXT:    v_mov_b32_e32 v18, v0
+; GFX11-NEXT:    v_mov_b32_e32 v19, v0
+; GFX11-NEXT:    v_mov_b32_e32 v20, v0
+; GFX11-NEXT:    v_mov_b32_e32 v21, v0
+; GFX11-NEXT:    v_mov_b32_e32 v22, v0
+; GFX11-NEXT:    v_mov_b32_e32 v23, v0
+; GFX11-NEXT:    v_mov_b32_e32 v24, v0
+; GFX11-NEXT:    v_mov_b32_e32 v25, v0
+; GFX11-NEXT:    v_mov_b32_e32 v26, v0
+; GFX11-NEXT:    v_mov_b32_e32 v27, v0
+; GFX11-NEXT:    v_mov_b32_e32 v28, v0
+; GFX11-NEXT:    v_mov_b32_e32 v29, v0
+; GFX11-NEXT:    v_mov_b32_e32 v30, v0
+; GFX11-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x i32>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x i32> %val0, <32 x i32> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -874,28 +1059,38 @@ entry:
 define <2 x bfloat> @shuffle_v2bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v2bf16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v2bf16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v2bf16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x bfloat> %val0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
@@ -905,31 +1100,42 @@ entry:
 define <3 x bfloat> @shuffle_v3bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v1, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
-; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v1, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v3bf16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v3bf16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v3bf16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x bfloat> %val0, <3 x bfloat> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -939,32 +1145,43 @@ entry:
 define <4 x bfloat> @shuffle_v4bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v4bf16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x bfloat> %val0, <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -974,35 +1191,47 @@ entry:
 define <6 x bfloat> @shuffle_v6bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v6bf16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v6bf16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v6bf16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x bfloat> %val0, <6 x bfloat> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1012,38 +1241,51 @@ entry:
 define <8 x bfloat> @shuffle_v8bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v8bf16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v8bf16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v8bf16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x bfloat> %val0, <8 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1053,50 +1295,67 @@ entry:
 define <16 x bfloat> @shuffle_v16bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v16bf16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v16bf16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v16bf16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x bfloat> %val0, <16 x bfloat> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1106,74 +1365,99 @@ entry:
 define <32 x bfloat> @shuffle_v32bf16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32bf16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v32bf16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v32bf16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v32bf16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x bfloat>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x bfloat> %val0, <32 x bfloat> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1183,28 +1467,38 @@ entry:
 define <2 x half> @shuffle_v2f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v2f16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v2f16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v2f16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x half> %val0, <2 x half> poison, <2 x i32> <i32 1, i32 1>
@@ -1214,31 +1508,42 @@ entry:
 define <3 x half> @shuffle_v3f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v1, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v1, v1, s4
-; GFX9-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v1, v1, s4
+; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v1, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX10-NEXT:  v_alignbit_b32 v1, s4, v1, 16
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v3f16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v1, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:  v_alignbit_b32 v1, s0, v1, 16
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v1, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX10-NEXT:    v_alignbit_b32 v1, s4, v1, 16
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v3f16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v3f16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x half> %val0, <3 x half> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -1248,32 +1553,43 @@ entry:
 define <4 x half> @shuffle_v4f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v4f16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v4f16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x half> %val0, <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -1283,35 +1599,47 @@ entry:
 define <6 x half> @shuffle_v6f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v6f16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v6f16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v6f16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x half> %val0, <6 x half> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1321,38 +1649,51 @@ entry:
 define <8 x half> @shuffle_v8f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v8f16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v8f16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v8f16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x half> %val0, <8 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1362,50 +1703,67 @@ entry:
 define <16 x half> @shuffle_v16f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v16f16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v16f16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v16f16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x half> %val0, <16 x half> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1415,74 +1773,99 @@ entry:
 define <32 x half> @shuffle_v32f16_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dword v0, v[0:1], off
-; GFX9-NEXT:  s_mov_b32 s4, 0x7060302
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_perm_b32 v0, v0, v0, s4
-; GFX9-NEXT:  v_mov_b32_e32 v1, v0
-; GFX9-NEXT:  v_mov_b32_e32 v2, v0
-; GFX9-NEXT:  v_mov_b32_e32 v3, v0
-; GFX9-NEXT:  v_mov_b32_e32 v4, v0
-; GFX9-NEXT:  v_mov_b32_e32 v5, v0
-; GFX9-NEXT:  v_mov_b32_e32 v6, v0
-; GFX9-NEXT:  v_mov_b32_e32 v7, v0
-; GFX9-NEXT:  v_mov_b32_e32 v8, v0
-; GFX9-NEXT:  v_mov_b32_e32 v9, v0
-; GFX9-NEXT:  v_mov_b32_e32 v10, v0
-; GFX9-NEXT:  v_mov_b32_e32 v11, v0
-; GFX9-NEXT:  v_mov_b32_e32 v12, v0
-; GFX9-NEXT:  v_mov_b32_e32 v13, v0
-; GFX9-NEXT:  v_mov_b32_e32 v14, v0
-; GFX9-NEXT:  v_mov_b32_e32 v15, v0
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v5, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, v0
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-NEXT:    v_mov_b32_e32 v11, v0
+; GFX9-NEXT:    v_mov_b32_e32 v12, v0
+; GFX9-NEXT:    v_mov_b32_e32 v13, v0
+; GFX9-NEXT:    v_mov_b32_e32 v14, v0
+; GFX9-NEXT:    v_mov_b32_e32 v15, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f16_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dword v0, v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10-NEXT:  v_mov_b32_e32 v1, v0
-; GFX10-NEXT:  v_mov_b32_e32 v2, v0
-; GFX10-NEXT:  v_mov_b32_e32 v3, v0
-; GFX10-NEXT:  v_mov_b32_e32 v4, v0
-; GFX10-NEXT:  v_mov_b32_e32 v5, v0
-; GFX10-NEXT:  v_mov_b32_e32 v6, v0
-; GFX10-NEXT:  v_mov_b32_e32 v7, v0
-; GFX10-NEXT:  v_mov_b32_e32 v8, v0
-; GFX10-NEXT:  v_mov_b32_e32 v9, v0
-; GFX10-NEXT:  v_mov_b32_e32 v10, v0
-; GFX10-NEXT:  v_mov_b32_e32 v11, v0
-; GFX10-NEXT:  v_mov_b32_e32 v12, v0
-; GFX10-NEXT:  v_mov_b32_e32 v13, v0
-; GFX10-NEXT:  v_mov_b32_e32 v14, v0
-; GFX10-NEXT:  v_mov_b32_e32 v15, v0
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: shuffle_v32f16_rebroadcast:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_perm_b32 v0, v0, v0, 0x7060302
-; GFX11-NEXT:  s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:  v_mov_b32_e32 v1, v0
-; GFX11-NEXT:  v_mov_b32_e32 v2, v0
-; GFX11-NEXT:  v_mov_b32_e32 v3, v0
-; GFX11-NEXT:  v_mov_b32_e32 v4, v0
-; GFX11-NEXT:  v_mov_b32_e32 v5, v0
-; GFX11-NEXT:  v_mov_b32_e32 v6, v0
-; GFX11-NEXT:  v_mov_b32_e32 v7, v0
-; GFX11-NEXT:  v_mov_b32_e32 v8, v0
-; GFX11-NEXT:  v_mov_b32_e32 v9, v0
-; GFX11-NEXT:  v_mov_b32_e32 v10, v0
-; GFX11-NEXT:  v_mov_b32_e32 v11, v0
-; GFX11-NEXT:  v_mov_b32_e32 v12, v0
-; GFX11-NEXT:  v_mov_b32_e32 v13, v0
-; GFX11-NEXT:  v_mov_b32_e32 v14, v0
-; GFX11-NEXT:  v_mov_b32_e32 v15, v0
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, v0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v0
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v7, v0
+; GFX10-NEXT:    v_mov_b32_e32 v8, v0
+; GFX10-NEXT:    v_mov_b32_e32 v9, v0
+; GFX10-NEXT:    v_mov_b32_e32 v10, v0
+; GFX10-NEXT:    v_mov_b32_e32 v11, v0
+; GFX10-NEXT:    v_mov_b32_e32 v12, v0
+; GFX10-NEXT:    v_mov_b32_e32 v13, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, v0
+; GFX10-NEXT:    v_mov_b32_e32 v15, v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: shuffle_v32f16_rebroadcast:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v32f16_rebroadcast:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v4, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v5, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v7, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v8, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v9, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v10, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v12, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v13, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v14, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v15, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x half>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x half> %val0, <32 x half> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1492,27 +1875,27 @@ entry:
 define <2 x float> @shuffle_v2f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v2f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <2 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <2 x float> %val0, <2 x float> poison, <2 x i32> <i32 1, i32 1>
@@ -1522,30 +1905,30 @@ entry:
 define <3 x float> @shuffle_v3f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx3 v[0:2], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx3 v[0:2], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v3f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b96 v[0:2], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <3 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <3 x float> %val0, <3 x float> poison, <3 x i32> <i32 1, i32 1, i32 1>
@@ -1555,33 +1938,33 @@ entry:
 define <4 x float> @shuffle_v4f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v4f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <4 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <4 x float> %val0, <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -1591,39 +1974,39 @@ entry:
 define <6 x float> @shuffle_v6f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v6f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <6 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <6 x float> %val0, <6 x float> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1633,45 +2016,45 @@ entry:
 define <8 x float> @shuffle_v8f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v8f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <8 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <8 x float> %val0, <8 x float> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1681,69 +2064,69 @@ entry:
 define <16 x float> @shuffle_v16f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  v_mov_b32_e32 v8, v1
-; GFX9-NEXT:  v_mov_b32_e32 v9, v1
-; GFX9-NEXT:  v_mov_b32_e32 v10, v1
-; GFX9-NEXT:  v_mov_b32_e32 v11, v1
-; GFX9-NEXT:  v_mov_b32_e32 v12, v1
-; GFX9-NEXT:  v_mov_b32_e32 v13, v1
-; GFX9-NEXT:  v_mov_b32_e32 v14, v1
-; GFX9-NEXT:  v_mov_b32_e32 v15, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  v_mov_b32_e32 v8, v1
-; GFX10-NEXT:  v_mov_b32_e32 v9, v1
-; GFX10-NEXT:  v_mov_b32_e32 v10, v1
-; GFX10-NEXT:  v_mov_b32_e32 v11, v1
-; GFX10-NEXT:  v_mov_b32_e32 v12, v1
-; GFX10-NEXT:  v_mov_b32_e32 v13, v1
-; GFX10-NEXT:  v_mov_b32_e32 v14, v1
-; GFX10-NEXT:  v_mov_b32_e32 v15, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v1
+; GFX10-NEXT:    v_mov_b32_e32 v12, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, v1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v16f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  v_mov_b32_e32 v8, v1
-; GFX11-NEXT:  v_mov_b32_e32 v9, v1
-; GFX11-NEXT:  v_mov_b32_e32 v10, v1
-; GFX11-NEXT:  v_mov_b32_e32 v11, v1
-; GFX11-NEXT:  v_mov_b32_e32 v12, v1
-; GFX11-NEXT:  v_mov_b32_e32 v13, v1
-; GFX11-NEXT:  v_mov_b32_e32 v14, v1
-; GFX11-NEXT:  v_mov_b32_e32 v15, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GFX11-NEXT:    v_mov_b32_e32 v9, v1
+; GFX11-NEXT:    v_mov_b32_e32 v10, v1
+; GFX11-NEXT:    v_mov_b32_e32 v11, v1
+; GFX11-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-NEXT:    v_mov_b32_e32 v13, v1
+; GFX11-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-NEXT:    v_mov_b32_e32 v15, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <16 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <16 x float> %val0, <16 x float> poison, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1753,117 +2136,117 @@ entry:
 define <32 x float> @shuffle_v32f32_rebroadcast(ptr addrspace(1) %arg0) {
 ; GFX9-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT:  s_waitcnt vmcnt(0)
-; GFX9-NEXT:  v_mov_b32_e32 v0, v1
-; GFX9-NEXT:  v_mov_b32_e32 v2, v1
-; GFX9-NEXT:  v_mov_b32_e32 v3, v1
-; GFX9-NEXT:  v_mov_b32_e32 v4, v1
-; GFX9-NEXT:  v_mov_b32_e32 v5, v1
-; GFX9-NEXT:  v_mov_b32_e32 v6, v1
-; GFX9-NEXT:  v_mov_b32_e32 v7, v1
-; GFX9-NEXT:  v_mov_b32_e32 v8, v1
-; GFX9-NEXT:  v_mov_b32_e32 v9, v1
-; GFX9-NEXT:  v_mov_b32_e32 v10, v1
-; GFX9-NEXT:  v_mov_b32_e32 v11, v1
-; GFX9-NEXT:  v_mov_b32_e32 v12, v1
-; GFX9-NEXT:  v_mov_b32_e32 v13, v1
-; GFX9-NEXT:  v_mov_b32_e32 v14, v1
-; GFX9-NEXT:  v_mov_b32_e32 v15, v1
-; GFX9-NEXT:  v_mov_b32_e32 v16, v1
-; GFX9-NEXT:  v_mov_b32_e32 v17, v1
-; GFX9-NEXT:  v_mov_b32_e32 v18, v1
-; GFX9-NEXT:  v_mov_b32_e32 v19, v1
-; GFX9-NEXT:  v_mov_b32_e32 v20, v1
-; GFX9-NEXT:  v_mov_b32_e32 v21, v1
-; GFX9-NEXT:  v_mov_b32_e32 v22, v1
-; GFX9-NEXT:  v_mov_b32_e32 v23, v1
-; GFX9-NEXT:  v_mov_b32_e32 v24, v1
-; GFX9-NEXT:  v_mov_b32_e32 v25, v1
-; GFX9-NEXT:  v_mov_b32_e32 v26, v1
-; GFX9-NEXT:  v_mov_b32_e32 v27, v1
-; GFX9-NEXT:  v_mov_b32_e32 v28, v1
-; GFX9-NEXT:  v_mov_b32_e32 v29, v1
-; GFX9-NEXT:  v_mov_b32_e32 v30, v1
-; GFX9-NEXT:  v_mov_b32_e32 v31, v1
-; GFX9-NEXT:  s_setpc_b64 s[30:31]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, v1
+; GFX9-NEXT:    v_mov_b32_e32 v6, v1
+; GFX9-NEXT:    v_mov_b32_e32 v7, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v1
+; GFX9-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, v1
+; GFX9-NEXT:    v_mov_b32_e32 v11, v1
+; GFX9-NEXT:    v_mov_b32_e32 v12, v1
+; GFX9-NEXT:    v_mov_b32_e32 v13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v14, v1
+; GFX9-NEXT:    v_mov_b32_e32 v15, v1
+; GFX9-NEXT:    v_mov_b32_e32 v16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-NEXT:    v_mov_b32_e32 v18, v1
+; GFX9-NEXT:    v_mov_b32_e32 v19, v1
+; GFX9-NEXT:    v_mov_b32_e32 v20, v1
+; GFX9-NEXT:    v_mov_b32_e32 v21, v1
+; GFX9-NEXT:    v_mov_b32_e32 v22, v1
+; GFX9-NEXT:    v_mov_b32_e32 v23, v1
+; GFX9-NEXT:    v_mov_b32_e32 v24, v1
+; GFX9-NEXT:    v_mov_b32_e32 v25, v1
+; GFX9-NEXT:    v_mov_b32_e32 v26, v1
+; GFX9-NEXT:    v_mov_b32_e32 v27, v1
+; GFX9-NEXT:    v_mov_b32_e32 v28, v1
+; GFX9-NEXT:    v_mov_b32_e32 v29, v1
+; GFX9-NEXT:    v_mov_b32_e32 v30, v1
+; GFX9-NEXT:    v_mov_b32_e32 v31, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX10:       ; %bb.0: ; %entry
-; GFX10-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:  global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT:  s_waitcnt vmcnt(0)
-; GFX10-NEXT:  v_mov_b32_e32 v0, v1
-; GFX10-NEXT:  v_mov_b32_e32 v2, v1
-; GFX10-NEXT:  v_mov_b32_e32 v3, v1
-; GFX10-NEXT:  v_mov_b32_e32 v4, v1
-; GFX10-NEXT:  v_mov_b32_e32 v5, v1
-; GFX10-NEXT:  v_mov_b32_e32 v6, v1
-; GFX10-NEXT:  v_mov_b32_e32 v7, v1
-; GFX10-NEXT:  v_mov_b32_e32 v8, v1
-; GFX10-NEXT:  v_mov_b32_e32 v9, v1
-; GFX10-NEXT:  v_mov_b32_e32 v10, v1
-; GFX10-NEXT:  v_mov_b32_e32 v11, v1
-; GFX10-NEXT:  v_mov_b32_e32 v12, v1
-; GFX10-NEXT:  v_mov_b32_e32 v13, v1
-; GFX10-NEXT:  v_mov_b32_e32 v14, v1
-; GFX10-NEXT:  v_mov_b32_e32 v15, v1
-; GFX10-NEXT:  v_mov_b32_e32 v16, v1
-; GFX10-NEXT:  v_mov_b32_e32 v17, v1
-; GFX10-NEXT:  v_mov_b32_e32 v18, v1
-; GFX10-NEXT:  v_mov_b32_e32 v19, v1
-; GFX10-NEXT:  v_mov_b32_e32 v20, v1
-; GFX10-NEXT:  v_mov_b32_e32 v21, v1
-; GFX10-NEXT:  v_mov_b32_e32 v22, v1
-; GFX10-NEXT:  v_mov_b32_e32 v23, v1
-; GFX10-NEXT:  v_mov_b32_e32 v24, v1
-; GFX10-NEXT:  v_mov_b32_e32 v25, v1
-; GFX10-NEXT:  v_mov_b32_e32 v26, v1
-; GFX10-NEXT:  v_mov_b32_e32 v27, v1
-; GFX10-NEXT:  v_mov_b32_e32 v28, v1
-; GFX10-NEXT:  v_mov_b32_e32 v29, v1
-; GFX10-NEXT:  v_mov_b32_e32 v30, v1
-; GFX10-NEXT:  v_mov_b32_e32 v31, v1
-; GFX10-NEXT:  s_setpc_b64 s[30:31]
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, v1
+; GFX10-NEXT:    v_mov_b32_e32 v3, v1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v1
+; GFX10-NEXT:    v_mov_b32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v7, v1
+; GFX10-NEXT:    v_mov_b32_e32 v8, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, v1
+; GFX10-NEXT:    v_mov_b32_e32 v10, v1
+; GFX10-NEXT:    v_mov_b32_e32 v11, v1
+; GFX10-NEXT:    v_mov_b32_e32 v12, v1
+; GFX10-NEXT:    v_mov_b32_e32 v13, v1
+; GFX10-NEXT:    v_mov_b32_e32 v14, v1
+; GFX10-NEXT:    v_mov_b32_e32 v15, v1
+; GFX10-NEXT:    v_mov_b32_e32 v16, v1
+; GFX10-NEXT:    v_mov_b32_e32 v17, v1
+; GFX10-NEXT:    v_mov_b32_e32 v18, v1
+; GFX10-NEXT:    v_mov_b32_e32 v19, v1
+; GFX10-NEXT:    v_mov_b32_e32 v20, v1
+; GFX10-NEXT:    v_mov_b32_e32 v21, v1
+; GFX10-NEXT:    v_mov_b32_e32 v22, v1
+; GFX10-NEXT:    v_mov_b32_e32 v23, v1
+; GFX10-NEXT:    v_mov_b32_e32 v24, v1
+; GFX10-NEXT:    v_mov_b32_e32 v25, v1
+; GFX10-NEXT:    v_mov_b32_e32 v26, v1
+; GFX10-NEXT:    v_mov_b32_e32 v27, v1
+; GFX10-NEXT:    v_mov_b32_e32 v28, v1
+; GFX10-NEXT:    v_mov_b32_e32 v29, v1
+; GFX10-NEXT:    v_mov_b32_e32 v30, v1
+; GFX10-NEXT:    v_mov_b32_e32 v31, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: shuffle_v32f32_rebroadcast:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:  s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:  global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT:  s_waitcnt vmcnt(0)
-; GFX11-NEXT:  v_mov_b32_e32 v0, v1
-; GFX11-NEXT:  v_mov_b32_e32 v2, v1
-; GFX11-NEXT:  v_mov_b32_e32 v3, v1
-; GFX11-NEXT:  v_mov_b32_e32 v4, v1
-; GFX11-NEXT:  v_mov_b32_e32 v5, v1
-; GFX11-NEXT:  v_mov_b32_e32 v6, v1
-; GFX11-NEXT:  v_mov_b32_e32 v7, v1
-; GFX11-NEXT:  v_mov_b32_e32 v8, v1
-; GFX11-NEXT:  v_mov_b32_e32 v9, v1
-; GFX11-NEXT:  v_mov_b32_e32 v10, v1
-; GFX11-NEXT:  v_mov_b32_e32 v11, v1
-; GFX11-NEXT:  v_mov_b32_e32 v12, v1
-; GFX11-NEXT:  v_mov_b32_e32 v13, v1
-; GFX11-NEXT:  v_mov_b32_e32 v14, v1
-; GFX11-NEXT:  v_mov_b32_e32 v15, v1
-; GFX11-NEXT:  v_mov_b32_e32 v16, v1
-; GFX11-NEXT:  v_mov_b32_e32 v17, v1
-; GFX11-NEXT:  v_mov_b32_e32 v18, v1
-; GFX11-NEXT:  v_mov_b32_e32 v19, v1
-; GFX11-NEXT:  v_mov_b32_e32 v20, v1
-; GFX11-NEXT:  v_mov_b32_e32 v21, v1
-; GFX11-NEXT:  v_mov_b32_e32 v22, v1
-; GFX11-NEXT:  v_mov_b32_e32 v23, v1
-; GFX11-NEXT:  v_mov_b32_e32 v24, v1
-; GFX11-NEXT:  v_mov_b32_e32 v25, v1
-; GFX11-NEXT:  v_mov_b32_e32 v26, v1
-; GFX11-NEXT:  v_mov_b32_e32 v27, v1
-; GFX11-NEXT:  v_mov_b32_e32 v28, v1
-; GFX11-NEXT:  v_mov_b32_e32 v29, v1
-; GFX11-NEXT:  v_mov_b32_e32 v30, v1
-; GFX11-NEXT:  v_mov_b32_e32 v31, v1
-; GFX11-NEXT:  s_setpc_b64 s[30:31]
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_mov_b32_e32 v2, v1
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    v_mov_b32_e32 v4, v1
+; GFX11-NEXT:    v_mov_b32_e32 v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v6, v1
+; GFX11-NEXT:    v_mov_b32_e32 v7, v1
+; GFX11-NEXT:    v_mov_b32_e32 v8, v1
+; GFX11-NEXT:    v_mov_b32_e32 v9, v1
+; GFX11-NEXT:    v_mov_b32_e32 v10, v1
+; GFX11-NEXT:    v_mov_b32_e32 v11, v1
+; GFX11-NEXT:    v_mov_b32_e32 v12, v1
+; GFX11-NEXT:    v_mov_b32_e32 v13, v1
+; GFX11-NEXT:    v_mov_b32_e32 v14, v1
+; GFX11-NEXT:    v_mov_b32_e32 v15, v1
+; GFX11-NEXT:    v_mov_b32_e32 v16, v1
+; GFX11-NEXT:    v_mov_b32_e32 v17, v1
+; GFX11-NEXT:    v_mov_b32_e32 v18, v1
+; GFX11-NEXT:    v_mov_b32_e32 v19, v1
+; GFX11-NEXT:    v_mov_b32_e32 v20, v1
+; GFX11-NEXT:    v_mov_b32_e32 v21, v1
+; GFX11-NEXT:    v_mov_b32_e32 v22, v1
+; GFX11-NEXT:    v_mov_b32_e32 v23, v1
+; GFX11-NEXT:    v_mov_b32_e32 v24, v1
+; GFX11-NEXT:    v_mov_b32_e32 v25, v1
+; GFX11-NEXT:    v_mov_b32_e32 v26, v1
+; GFX11-NEXT:    v_mov_b32_e32 v27, v1
+; GFX11-NEXT:    v_mov_b32_e32 v28, v1
+; GFX11-NEXT:    v_mov_b32_e32 v29, v1
+; GFX11-NEXT:    v_mov_b32_e32 v30, v1
+; GFX11-NEXT:    v_mov_b32_e32 v31, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %val0 = load <32 x float>, ptr addrspace(1) %arg0
   %val1 = shufflevector <32 x float> %val0, <32 x float> poison, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>

diff  --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index bc8d0d1f8cd8f..58602a1ccd5ba 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GX900 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX942 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define <4 x half> @shuffle_v4f16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
 ; GFX9-LABEL: shuffle_v4f16_23uu:
@@ -205,15 +206,25 @@ define <4 x half> @shuffle_v4f16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_3u6u:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_3u6u:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_3u6u:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, v0, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 poison, i32 6, i32 poison>
@@ -254,15 +265,25 @@ define <4 x half> @shuffle_v4f16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_3uu7:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_3uu7:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_3uu7:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, v0, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 poison, i32 poison, i32 7>
@@ -302,14 +323,26 @@ define <4 x half> @shuffle_v4f16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_35u5:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_35u5:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_35u5:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 poison, i32 5>
@@ -352,16 +385,29 @@ define <4 x half> @shuffle_v4f16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_357u:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_357u:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_357u:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v3, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 poison>
@@ -950,15 +996,26 @@ define <4 x half> @shuffle_v4f16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_2356:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_2356:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_2356:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
@@ -999,15 +1056,26 @@ define <4 x half> @shuffle_v4f16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_5623:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_5623:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_5623:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
@@ -1037,16 +1105,29 @@ define <4 x half> @shuffle_v4f16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_3456:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_3456:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_3456:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -1076,16 +1157,29 @@ define <4 x half> @shuffle_v4f16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_5634:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_5634:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_5634:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
@@ -1128,16 +1222,28 @@ define <4 x half> @shuffle_v4f16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_5734:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_5734:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_5734:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
@@ -1178,15 +1284,27 @@ define <4 x i16> @shuffle_v4i16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %a
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4i16_2356:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4i16_2356:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4i16_2356:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x i16>, ptr addrspace(1) %arg0
   %val1 = load <4 x i16>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
@@ -1259,15 +1377,25 @@ define <4 x half> @shuffle_v4f16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_0000:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_0000:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_0000:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer
@@ -1293,15 +1421,26 @@ define <4 x half> @shuffle_v4f16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_1010:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_1010:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_1010:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v0, v0, 16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -1340,14 +1479,23 @@ define <4 x half> @shuffle_v4f16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_1100:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_1100:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_1100:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
@@ -1387,16 +1535,29 @@ define <4 x half> @shuffle_v4f16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_6161:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_6161:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v2, v[2:3], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_6161:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
@@ -1430,13 +1591,23 @@ define <4 x half> @shuffle_v4f16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_2333:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_2333:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_2333:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
@@ -1470,13 +1641,23 @@ define <4 x half> @shuffle_v4f16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_6667:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_6667:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_6667:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
@@ -1648,15 +1829,26 @@ define <4 x half> @shuffle_v8f16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v8f16_13_14_2_3:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v8f16_13_14_2_3:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v8f16_13_14_2_3:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x half>, ptr addrspace(1) %arg0
   %val1 = load <8 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
@@ -1690,13 +1882,21 @@ define <4 x half> @shuffle_v3f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v3f16_0122:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v3f16_0122:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v3f16_0122:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <3 x half>, ptr addrspace(1) %arg0
   %val1 = load <3 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
@@ -1720,13 +1920,22 @@ define <4 x half> @shuffle_v2f16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v2f16_0122:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v2f16_0122:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v2f16_0122:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x half>, ptr addrspace(1) %arg0
   %val1 = load <2 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
@@ -1938,15 +2147,26 @@ define <4 x half> @shuffle_v4f16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4f16_0456:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4f16_0456:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4f16_0456:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x half>, ptr addrspace(1) %arg0
   %val1 = load <4 x half>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
@@ -2041,14 +2261,23 @@ define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: low16bits_v2f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: low16bits_v2f16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: low16bits_v2f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
@@ -2087,14 +2316,26 @@ define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: hi16bits_v2f16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: hi16bits_v2f16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: hi16bits_v2f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
@@ -2168,14 +2409,25 @@ define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %
 ; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: hi16low16bits_v2bf16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: hi16low16bits_v2bf16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: hi16low16bits_v2bf16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x half>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x half>, ptr addrspace(1) %x1, align 4
@@ -2214,14 +2466,23 @@ define <2 x i16> @i16_low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: i16_low16bits:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: i16_low16bits:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: i16_low16bits:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
@@ -2295,14 +2556,26 @@ define <2 x i16> @i16_hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
 ; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: i16_hi16low16bits:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: i16_hi16low16bits:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: i16_hi16low16bits:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
@@ -2341,14 +2614,27 @@ define <2 x i16> @i16_hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: i16_hi16bits:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: i16_hi16bits:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: i16_hi16bits:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x i16>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x i16>, ptr addrspace(1) %x1, align 4
@@ -2732,14 +3018,23 @@ define void @shuffle_v4i8_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1,
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4i8_concat:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    global_store_b32 v[4:5], v0, off
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4i8_concat:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    global_store_b32 v[4:5], v0, off
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4i8_concat:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_d16_hi_b16 v0, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    global_store_b32 v[4:5], v0, off
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x i8>, ptr addrspace(1) %arg0
   %val1 = load <2 x i8>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <2 x i8> %val0, <2 x i8> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -3198,15 +3493,25 @@ define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_3u6u:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_3u6u:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_3u6u:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, v0, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 poison, i32 6, i32 poison>
@@ -3247,15 +3552,25 @@ define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_3uu7:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_3uu7:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_3uu7:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, v0, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 poison, i32 poison, i32 7>
@@ -3295,14 +3610,26 @@ define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_35u5:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_35u5:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_35u5:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 poison, i32 5>
@@ -3345,16 +3672,29 @@ define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_perm_b32 v0, v4, v6, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_357u:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, s0, v3, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_357u:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v4, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v4.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_357u:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, s0, v3, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 poison>
@@ -3943,15 +4283,26 @@ define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_2356:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_2356:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_2356:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
@@ -3992,15 +4343,26 @@ define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_5623:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_5623:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_5623:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
@@ -4030,16 +4392,29 @@ define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_alignbit_b32 v0, v4, v6, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_3456:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_3456:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_3456:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -4069,16 +4444,29 @@ define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_5634:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_5634:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_5634:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
@@ -4121,16 +4509,28 @@ define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_alignbit_b32 v1, v4, v6, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_5734:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_5734:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v2, v3
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_5734:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v3, v2, 0x7060302
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
@@ -4167,15 +4567,25 @@ define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_0000:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_0000:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_0000:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v0, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> zeroinitializer
@@ -4201,15 +4611,26 @@ define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_1010:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v0, v0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_1010:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_1010:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v0, v0, 16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -4248,14 +4669,23 @@ define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_1100:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[1:2], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
-; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_1100:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_1100:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[1:2], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
@@ -4295,16 +4725,29 @@ define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_6161:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_6161:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v2, v[2:3], off offset:4
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_6161:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0xffff, v1, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
@@ -4338,13 +4781,23 @@ define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_2333:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_2333:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_2333:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
@@ -4378,13 +4831,23 @@ define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_6667:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_6667:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v1, v0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_6667:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v0, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
@@ -4556,15 +5019,26 @@ define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrsp
 ; GFX10-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v8bf16_13_14_2_3:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
-; GFX11-NEXT:    global_load_b32 v1, v[0:1], off offset:4
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_alignbit_b32 v0, v3, v2, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v[2:3], off offset:8
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[0:1], off offset:4
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v3, v2, 16
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
@@ -4598,13 +5072,21 @@ define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v3bf16_0122:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v3bf16_0122:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v3bf16_0122:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v1, v1, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <3 x bfloat> %val0, <3 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
@@ -4628,13 +5110,22 @@ define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_alignbit_b32 v1, v0, v0, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v2bf16_0122:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v1, v0, v0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v2bf16_0122:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v2bf16_0122:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v0, v0, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <2 x bfloat> %val0, <2 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
@@ -4956,100 +5447,198 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
 ; GFX10-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: fma_shuffle_v2bf16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10
-; GFX11-NEXT:    s_load_b128 s[4:7], s[4:5], 0x0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[0:1]
-; GFX11-NEXT:    global_load_b64 v[2:3], v6, s[4:5]
-; GFX11-NEXT:    global_load_b64 v[4:5], v6, s[6:7]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2
-; GFX11-NEXT:    v_bfe_u32 v15, v1, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v16, 0x400000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v15, v15, v1, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-NEXT:    v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fmac_f32_e32 v0, v8, v4
-; GFX11-NEXT:    v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v7
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT:    v_fmac_f32_e32 v4, v2, v5
-; GFX11-NEXT:    v_fmac_f32_e32 v11, v12, v9
-; GFX11-NEXT:    v_bfe_u32 v9, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_or_b32_e32 v8, 0x400000, v4
-; GFX11-NEXT:    v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-NEXT:    v_or_b32_e32 v14, 0x400000, v11
-; GFX11-NEXT:    v_add3_u32 v13, v13, v11, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v15, v16, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fmac_f32_e32 v1, v3, v10
-; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v9, v1, 16, 1
-; GFX11-NEXT:    v_fmac_f32_e32 v7, v3, v5
-; GFX11-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v12, 0x400000, v7
-; GFX11-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_fmac_f32_e32 v0, v2, v10
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v1
-; GFX11-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
-; GFX11-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: fma_shuffle_v2bf16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10
+; GFX11-TRUE16-NEXT:    s_load_b128 s[4:7], s[4:5], 0x0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x2
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v6, s[0:1]
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v6, s[4:5]
+; GFX11-TRUE16-NEXT:    global_load_b64 v[4:5], v6, s[6:7]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v15, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v15, v15, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v7, 16, v0
+; GFX11-TRUE16-NEXT:    v_dual_fmac_f32 v7, v8, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v8, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v4, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v4, v2, v5
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v11, v12, v9
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v13, v11, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
+; GFX11-TRUE16-NEXT:    v_add3_u32 v13, v13, v11, 0x7fff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v15, v16, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v2, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-TRUE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v7, v3, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, v5, v8 :: v_dual_fmac_f32 v1, v3, v10
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v7, 16, 1
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-TRUE16-NEXT:    v_add3_u32 v10, v10, v7, 0x7fff
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x400000, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, v9, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v3, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0xffff, v2, v1
+; GFX11-TRUE16-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: fma_shuffle_v2bf16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10
+; GFX11-FAKE16-NEXT:    s_load_b128 s[4:7], s[4:5], 0x0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x2
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v6, s[0:1]
+; GFX11-FAKE16-NEXT:    global_load_b64 v[2:3], v6, s[4:5]
+; GFX11-FAKE16-NEXT:    global_load_b64 v[4:5], v6, s[6:7]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x400000, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v15, v15, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v0, v8, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v4, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v4, v4, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v4, v2, v5
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v11, v12, v9
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v11, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x400000, v11
+; GFX11-FAKE16-NEXT:    v_add3_u32 v13, v13, v11, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v15, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v13, v14, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v1, v3, v10
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v9, v1, 16, 1
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v7, v3, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v11, v7, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x400000, v7
+; GFX11-FAKE16-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_fmac_f32_e32 v0, v2, v10
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, 0x400000, v1
+; GFX11-FAKE16-NEXT:    v_add3_u32 v11, v11, v7, 0x7fff
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v2, v1, 0x7060302
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v3, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
 entry:
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp12 = zext i32 %tmp1 to i64
@@ -5116,15 +5705,26 @@ define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1
 ; GFX10-NEXT:    v_alignbit_b32 v1, v6, v5, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: shuffle_v4bf16_0456:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT:    global_load_b64 v[1:2], v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: shuffle_v4bf16_0456:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b64 v[2:3], v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: shuffle_v4bf16_0456:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b64 v[0:1], v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b64 v[1:2], v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
   %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
   %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
@@ -5161,14 +5761,23 @@ define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: low16bits:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: low16bits:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: low16bits:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
@@ -5207,14 +5816,26 @@ define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1)
 ; GFX10-NEXT:    v_perm_b32 v0, v5, v4, 0x7060302
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: hi16bits_v2bf16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: hi16bits_v2bf16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: hi16bits_v2bf16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
@@ -5253,14 +5874,23 @@ define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1)
 ; GFX10-NEXT:    v_bfi_b32 v0, 0xffff, v5, v4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: low16hi16bits_v2bf16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: low16hi16bits_v2bf16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: low16hi16bits_v2bf16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v2, v[2:3], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0xffff, v0, v2
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
@@ -5288,14 +5918,25 @@ define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
 ; GFX10-NEXT:    v_alignbit_b32 v0, v5, v4, 16
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: hi16low16bits:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-NEXT:    global_load_b32 v1, v[2:3], off
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: hi16low16bits:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-TRUE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: hi16low16bits:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v[2:3], off
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, v1, v0, 16
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
   %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir b/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir
index 656c849bbd56b..d07691997d6c1 100644
--- a/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
 
 ---
 
@@ -9,27 +10,51 @@ body:             |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2
 
-    ; GCN-LABEL: name: vopc
-    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
-    ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
-    ; GCN-NEXT: V_CMPX_EQ_I16_fake16_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[V_CMP_CLASS_F16_fake16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_fake16_e64_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit $exec
-    ; GCN-NEXT: [[V_CMP_GE_F16_fake16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_fake16_e64_dpp 1, [[COPY1]], 0, [[COPY]], 1, 1, 15, 15, 1, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
-    ; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp1]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
-    ; GCN-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
-    ; GCN-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
-    ; GCN-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_fake16_e64 0, [[V_CMP_NGE_F16_fake16_e64_]], 0, [[COPY]], 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc
-    ; GCN-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
+    ; GCN-TRUE16-LABEL: name: vopc
+    ; GCN-TRUE16: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GCN-TRUE16-NEXT: {{  $}}
+    ; GCN-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GCN-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GCN-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-TRUE16-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-TRUE16-NEXT: V_CMPX_EQ_I16_fake16_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_CMP_CLASS_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_fake16_e64 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], implicit-def $vcc_lo, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_CMP_GE_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_fake16_e64 1, [[V_MOV_B32_dpp2]], 0, [[COPY]], 1, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-TRUE16-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp3]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
+    ; GCN-TRUE16-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_fake16_e64 0, [[V_CMP_NGE_F16_fake16_e64_]], 0, [[COPY]], 0, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+    ; GCN-TRUE16-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc
+    ; GCN-TRUE16-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
+    ;
+    ; GCN-FAKE16-LABEL: name: vopc
+    ; GCN-FAKE16: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GCN-FAKE16-NEXT: {{  $}}
+    ; GCN-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GCN-FAKE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GCN-FAKE16-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-FAKE16-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
+    ; GCN-FAKE16-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-FAKE16-NEXT: V_CMPX_EQ_I16_fake16_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec
+    ; GCN-FAKE16-NEXT: [[V_CMP_CLASS_F16_fake16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_fake16_e64_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit $exec
+    ; GCN-FAKE16-NEXT: [[V_CMP_GE_F16_fake16_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_fake16_e64_dpp 1, [[COPY1]], 0, [[COPY]], 1, 1, 15, 15, 1, implicit $mode, implicit $exec
+    ; GCN-FAKE16-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-FAKE16-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp1]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
+    ; GCN-FAKE16-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
+    ; GCN-FAKE16-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
+    ; GCN-FAKE16-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
+    ; GCN-FAKE16-NEXT: [[V_CMP_NGE_F16_fake16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_fake16_e64 0, [[V_CMP_NGE_F16_fake16_e64_]], 0, [[COPY]], 0, implicit $mode, implicit $exec
+    ; GCN-FAKE16-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+    ; GCN-FAKE16-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc
+    ; GCN-FAKE16-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
     %2:vgpr_32 = COPY $vgpr2

diff  --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index bac70b69650cd..d41720e19c217 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s
 ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
-; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
 ; SI-LABEL: widen_i16_constant_load:
@@ -257,17 +258,29 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: widen_f16_constant_load:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_f16_e64 v2, s0, 4.0
-; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: widen_f16_constant_load:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_f16_e64 v0.l, s0, 4.0
+; GFX11-TRUE16-NEXT:    global_store_b16 v[1:2], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: widen_f16_constant_load:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_f16_e64 v2, s0, 4.0
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-FAKE16-NEXT:    s_endpgm
   %load = load half, ptr addrspace(4) %arg, align 4
   %add = fadd half %load, 4.0
   store half %add, ptr addrspace(1) null
@@ -377,22 +390,37 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: no_widen_i16_constant_divergent_load:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v0, v0, s[0:1]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_add_nc_u16 v2, 0x3e7, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_or_b32_e32 v2, 4, v2
-; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: no_widen_i16_constant_divergent_load:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_add_nc_u16 v0.l, 0x3e7, v0.l
+; GFX11-TRUE16-NEXT:    v_or_b16 v0.l, v0.l, 4
+; GFX11-TRUE16-NEXT:    global_store_b16 v[1:2], v0, off
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: no_widen_i16_constant_divergent_load:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v0, v0, s[0:1]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_add_nc_u16 v2, 0x3e7, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v2, 4, v2
+; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-FAKE16-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = zext i32 %tid to i64
   %gep.arg = getelementptr inbounds i16, ptr addrspace(4) %arg, i64 %tid.ext

diff  --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
index 98da9ef2e8819..436825ed56d45 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX12,GFX12-TRUE16
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX12,GFX12-FAKE16
 
 define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
@@ -366,16 +367,28 @@ bb:
 ; pack f16 elements with v_perm_b32 since they don't come from same b32
 
 define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
-; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    flat_load_b128 v[8:11], v[4:5]
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
-; GFX12-NEXT:    global_store_b64 v[6:7], v[4:5], off
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12-TRUE16:       ; %bb.0: ; %bb
+; GFX12-TRUE16-NEXT:    flat_load_b128 v[8:11], v[4:5]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v11.l
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v9.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, v10
+; GFX12-TRUE16-NEXT:    v_wmma_f16_16x16x16_f16 v[8:9], v[0:1], v[2:3], v[8:9] neg_lo:[0,0,1]
+; GFX12-TRUE16-NEXT:    global_store_b64 v[6:7], v[8:9], off
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    flat_load_b128 v[8:11], v[4:5]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX12-FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX12-FAKE16-NEXT:    global_store_b64 v[6:7], v[4:5], off
+; GFX12-FAKE16-NEXT:    s_endpgm
 bb:
   %C = load <8 x half>, ptr %Caddr
   %C_shuffle = shufflevector <8 x half> %C, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>


        


More information about the llvm-commits mailing list